[llvm] [AMDGPU] Add rotate/funnel shift pattern matching in instruction selection (PR #149817)
Aleksandar Spasojevic via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 06:50:43 PDT 2025
https://github.com/aleksandar-amd created https://github.com/llvm/llvm-project/pull/149817
This patch implements pattern recognition for rotate and funnel shift operations
in the instruction selection pass, converting shift+OR sequences back to efficient
V_ALIGNBIT_B32 instructions. Made ROTR and FSHR non-legal to force expansion
into shift sequences, allowing divergence-aware instruction selection to choose
optimal instructions.
From 19b6a58409330ab4e2a3f1e4380b0fae62a773c8 Mon Sep 17 00:00:00 2001
From: Aleksandar Spasojevic <aleksandar.spasojevic at amd.com>
Date: Tue, 10 Jun 2025 17:17:39 +0200
Subject: [PATCH] [AMDGPU] Optimize rotate instruction selection patterns
This patch improves rotate instruction selection for AMDGPU by adding
optimized patterns for the rotate right (rotr) operation. It now selects
s_lshl + s_lshr + s_or (3 SALU instructions) instead of the previous
v_alignbit + v_readfirstlane (2 VALU instructions).
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 169 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 227 +
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 971 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 352 +-
.../AMDGPU/GlobalISel/inst-select-fshr.mir | 41 -
.../AMDGPU/GlobalISel/legalize-fshl.mir | 111 +-
.../AMDGPU/GlobalISel/legalize-fshr.mir | 102 +-
.../AMDGPU/GlobalISel/legalize-rotl-rotr.mir | 79 +-
.../AMDGPU/GlobalISel/regbankselect-fshr.mir | 168 -
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 92 +-
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 92 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 35946 ++++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 1474 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 3174 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 268 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 36 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 7970 ++--
.../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 698 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 822 +-
.../CodeGen/AMDGPU/any_extend_vector_inreg.ll | 66 +-
.../atomic_optimizations_global_pointer.ll | 38 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 2871 +-
llvm/test/CodeGen/AMDGPU/bswap.ll | 69 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 378 +-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 182 +-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 182 +-
.../build-vector-packed-partial-undef.ll | 8 +-
llvm/test/CodeGen/AMDGPU/build_vector.ll | 4 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 28 +-
.../AMDGPU/divergence-driven-buildvector.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 50 +-
llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 252 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 396 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 338 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 338 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 264 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 93 +-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fneg.bf16.ll | 37 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 18 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 597 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 1012 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 94 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 672 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 508 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 508 +-
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 384 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 48 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 53 +-
.../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 43 +-
.../llvm.amdgcn.raw.ptr.buffer.store.ll | 17 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 724 +-
llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 867 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 192 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 280 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 280 +-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 192 +-
llvm/test/CodeGen/AMDGPU/packetizer.ll | 3 +
llvm/test/CodeGen/AMDGPU/permute.ll | 4 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 256 +-
llvm/test/CodeGen/AMDGPU/rotate-add.ll | 54 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 204 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 277 +-
llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 18 +-
.../SPIRV/generic-cast-explicit.ll | 15 +-
70 files changed, 34913 insertions(+), 30842 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..c61f3a54ec2b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -820,6 +820,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectSTACKRESTORE(N);
return;
}
+ case ISD::OR: {
+ if (SDNode *Selected = selectRotateOrFunnelShiftPattern(N)) {
+ ReplaceNode(N, Selected);
+ return;
+ }
+ break;
+ }
}
SelectCode(N);
@@ -4105,6 +4112,168 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
} while (IsModified);
}
+// Matches rotate/funnel shift patterns (shift+OR sequences)
+// and converts them to v_alignbit_b32 instructions
+SDNode *AMDGPUDAGToDAGISel::selectRotateOrFunnelShiftPattern(SDNode *N) {
+ if (N->getOpcode() != ISD::OR)
+ return nullptr;
+
+ // Only handle 32-bit operations
+ if (N->getValueType(0) != MVT::i32)
+ return nullptr;
+
+ if (!N->isDivergent())
+ return nullptr;
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ SDNode *ShlNode = nullptr;
+ SDNode *SrlNode = nullptr;
+
+ // Check both orderings: (shl, srl) and (srl, shl)
+ bool IsLHSShl = LHS.getOpcode() == ISD::SHL;
+ bool IsRHSSrl = RHS.getOpcode() == ISD::SRL;
+ bool IsLHSSrl = LHS.getOpcode() == ISD::SRL;
+ bool IsRHSShl = RHS.getOpcode() == ISD::SHL;
+
+ if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+ ShlNode = IsLHSShl ? LHS.getNode() : RHS.getNode();
+ SrlNode = IsRHSSrl ? RHS.getNode() : LHS.getNode();
+ } else {
+ return nullptr;
+ }
+
+ // Extract sources and shift amounts
+ SDValue ShlSrc = ShlNode->getOperand(0);
+ SDValue ShlAmt = ShlNode->getOperand(1);
+ SDValue SrlSrc = SrlNode->getOperand(0);
+ SDValue SrlAmt = SrlNode->getOperand(1);
+
+ // Handle the legalizer's (src << 1) pattern for SHL source
+ if (ShlSrc.getOpcode() == ISD::SHL)
+ if (ConstantSDNode *PreShlAmt =
+ dyn_cast<ConstantSDNode>(ShlSrc.getOperand(1)))
+ if (PreShlAmt->getZExtValue() == 1)
+ ShlSrc = ShlSrc.getOperand(0);
+
+ // Helper function to build AlignBit instruction
+ auto buildAlignBitInstruction = [&](SDValue AlignBitSrc0,
+ SDValue AlignBitSrc1,
+ SDValue ShiftAmount) -> SDNode * {
+ SDLoc DL(N);
+
+ // Select opcode based on subtarget features
+ const GCNSubtarget &ST = CurDAG->getSubtarget<GCNSubtarget>();
+ unsigned Opcode =
+ ST.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : ST.hasTrue16BitInsts()
+ ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : AMDGPU::V_ALIGNBIT_B32_e64;
+
+ SDValue Ops[8]; // Maximum operands needed
+ unsigned NumOps = 0;
+
+ if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+ Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+ // Extended format with modifiers
+ Ops[0] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src0_modifiers
+ Ops[1] = AlignBitSrc0; // src0
+ Ops[2] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src1_modifiers
+ Ops[3] = AlignBitSrc1; // src1
+ Ops[4] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src2_modifiers
+ Ops[5] = ShiftAmount; // src2
+ Ops[6] = CurDAG->getTargetConstant(0, DL, MVT::i32); // clamp
+ Ops[7] = CurDAG->getTargetConstant(0, DL, MVT::i32); // op_sel
+ NumOps = 8;
+ } else {
+ // Regular e64 format
+ Ops[0] = AlignBitSrc0;
+ Ops[1] = AlignBitSrc1;
+ Ops[2] = ShiftAmount;
+ NumOps = 3;
+ }
+
+ return CurDAG->getMachineNode(Opcode, DL, MVT::i32,
+ ArrayRef<SDValue>(Ops, NumOps));
+ };
+
+ // Case 1: Both shift amounts are constants
+ ConstantSDNode *ShlConstant = dyn_cast<ConstantSDNode>(ShlAmt);
+ ConstantSDNode *SrlConstant = dyn_cast<ConstantSDNode>(SrlAmt);
+
+ if (ShlConstant && SrlConstant) {
+ int64_t ShlVal = ShlConstant->getSExtValue();
+ int64_t SrlVal = SrlConstant->getSExtValue();
+
+ if (ShlVal + SrlVal != 32)
+ return nullptr;
+
+ // Create constant for shift amount
+ SDLoc DL(N);
+ SDValue ConstAmtNode = CurDAG->getTargetConstant(SrlVal, DL, MVT::i32);
+
+ return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtNode);
+ }
+
+ // Helper to extract shift amount from (some_value & 31) pattern
+ auto getShiftAmount = [&](SDValue ShiftAmtVal) -> SDValue {
+ if (ShiftAmtVal.getOpcode() == ISD::AND)
+ if (ConstantSDNode *MaskNode =
+ dyn_cast<ConstantSDNode>(ShiftAmtVal.getOperand(1)))
+ if (MaskNode->getZExtValue() == 31)
+ return ShiftAmtVal.getOperand(0);
+
+ return SDValue();
+ };
+
+ // Case 2: Variable shift amounts - check the AND pattern
+ SDValue ShlAmtSrc = getShiftAmount(ShlAmt);
+ SDValue SrlAmtSrc = getShiftAmount(SrlAmt);
+
+ if (!ShlAmtSrc || !SrlAmtSrc)
+ return nullptr;
+
+ // Check if SHL amount comes from NOT or NEG of the original amount
+ SDValue OriginalAmt;
+ bool IsRotatePattern = false;
+
+ if (ShlAmtSrc.getOpcode() == ISD::XOR) {
+ // FSHR pattern: SHL amount = (~original_amt) & 31
+ if (ConstantSDNode *XorMask =
+ dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(1))) {
+ if (XorMask->getSExtValue() == -1) {
+ if (ShlAmtSrc.getOperand(0) == SrlAmtSrc) {
+ OriginalAmt = SrlAmtSrc;
+ IsRotatePattern = false;
+ }
+ }
+ }
+ } else if (ShlAmtSrc.getOpcode() == ISD::SUB) {
+ // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+ if (ConstantSDNode *SubLHS =
+ dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(0))) {
+ if (SubLHS->getZExtValue() == 0) {
+ if (ShlAmtSrc.getOperand(1) == SrlAmtSrc) {
+ OriginalAmt = SrlAmtSrc;
+ IsRotatePattern = true;
+ }
+ }
+ }
+ }
+
+ if (!OriginalAmt)
+ return nullptr;
+
+ SDValue AlignBitSrc0 = ShlSrc;
+ SDValue AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+
+ return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, OriginalAmt);
+}
+
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
CodeGenOptLevel OptLevel)
: SelectionDAGISelLegacy(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..b73259054d581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -284,6 +284,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectINTRINSIC_VOID(SDNode *N);
void SelectWAVE_ADDRESS(SDNode *N);
void SelectSTACKRESTORE(SDNode *N);
+ SDNode *selectRotateOrFunnelShiftPattern(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b037cdd5393ea..49d122a91c7e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -486,12 +486,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
}
- // The hardware supports 32-bit FSHR, but not FSHL.
- setOperationAction(ISD::FSHR, MVT::i32, Legal);
+ if (Subtarget->isGCN()) {
+ setOperationAction(ISD::FSHR, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Expand);
+ } else {
+ setOperationAction(ISD::FSHR, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..78506d8976f22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -406,6 +406,231 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectRotateOrFunnelShiftPattern(
+ MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ Register LHS = I.getOperand(1).getReg();
+ Register RHS = I.getOperand(2).getReg();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (!IsVALU)
+ return false;
+
+ // Check if this is a 32-bit operation
+ if (MRI->getType(DstReg).getSizeInBits() != 32)
+ return false;
+
+ MachineInstr *LHSInst = getDefIgnoringCopies(LHS, *MRI);
+ MachineInstr *RHSInst = getDefIgnoringCopies(RHS, *MRI);
+
+ MachineInstr *ShlInst = nullptr;
+ MachineInstr *SrlInst = nullptr;
+
+ // Check both orderings: (shl, srl) and (srl, shl)
+ bool IsLHSShl = LHSInst->getOpcode() == TargetOpcode::G_SHL;
+ bool IsRHSSrl = RHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsLHSSrl = LHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsRHSShl = RHSInst->getOpcode() == TargetOpcode::G_SHL;
+
+ if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+ ShlInst = IsLHSShl ? LHSInst : RHSInst;
+ SrlInst = IsRHSSrl ? RHSInst : LHSInst;
+ } else
+ return false;
+
+ // Extract the base sources, handling the legalizer's (src << 1) pattern
+ Register ShlSrc = ShlInst->getOperand(1).getReg();
+ Register SrlSrc = SrlInst->getOperand(1).getReg();
+
+ // Check if SHL source comes from (original_src << 1)
+ MachineInstr *PreShlInst = getDefIgnoringCopies(ShlSrc, *MRI);
+ if (PreShlInst && PreShlInst->getOpcode() == TargetOpcode::G_SHL) {
+ std::optional<ValueAndVReg> PreShlAmt = getIConstantVRegValWithLookThrough(
+ PreShlInst->getOperand(2).getReg(), *MRI);
+ if (PreShlAmt && PreShlAmt->Value.getZExtValue() == 1)
+ ShlSrc = PreShlInst->getOperand(1).getReg();
+ }
+ // Helper function to build AlignBit instruction
+ auto buildAlignBitInstruction = [&](Register AlignBitSrc0,
+ Register AlignBitSrc1,
+ Register ShiftAmount) -> bool {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ // Select opcode based on subtarget features
+ unsigned Opcode =
+ STI.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : STI.hasTrue16BitInsts()
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : AMDGPU::V_ALIGNBIT_B32_e64;
+
+ // Check constant bus restriction and copy SGPRs to VGPRs if needed
+ unsigned ConstantBusLimit = STI.getConstantBusLimit(Opcode);
+ unsigned SGPRCount = 0;
+
+ Register AlignBitSrc0ToUse = AlignBitSrc0;
+ Register AlignBitSrc1ToUse = AlignBitSrc1;
+ Register ShiftAmountToUse = ShiftAmount;
+
+ // Count SGPR operands
+ SGPRCount += (RBI.getRegBank(AlignBitSrc0, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(AlignBitSrc1, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(ShiftAmount, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+
+ // If we exceed the constant bus limit, copy SGPRs to VGPRs
+ if (SGPRCount > ConstantBusLimit) {
+ auto copyToVGPRIfNeeded = [&](Register &RegToUse, Register OrigReg) {
+ if (RBI.getRegBank(OrigReg, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID &&
+ SGPRCount > ConstantBusLimit) {
+ RegToUse = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_MOV_B32_e32), RegToUse)
+ .addReg(OrigReg);
+ SGPRCount--;
+ }
+ };
+
+ copyToVGPRIfNeeded(AlignBitSrc0ToUse, AlignBitSrc0);
+ copyToVGPRIfNeeded(AlignBitSrc1ToUse, AlignBitSrc1);
+ copyToVGPRIfNeeded(ShiftAmountToUse, ShiftAmount);
+ }
+
+ auto AlignBit = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg);
+
+ if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+ Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+ // t16/fake16 variants have extended operand format
+ AlignBit
+ .addImm(0) // src0_modifiers
+ .addReg(AlignBitSrc0ToUse) // src0
+ .addImm(0) // src1_modifiers
+ .addReg(AlignBitSrc1ToUse) // src1
+ .addImm(0) // src2_modifiers
+ .addReg(ShiftAmountToUse) // src2
+ .addImm(0) // clamp
+ .addImm(0); // op_sel
+ } else {
+ AlignBit.addReg(AlignBitSrc0ToUse)
+ .addReg(AlignBitSrc1ToUse)
+ .addReg(ShiftAmountToUse);
+ }
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*AlignBit, TII, TRI, RBI);
+ };
+
+ // Get shift amounts for both SHL and SRL
+ Register ShlAmtReg = ShlInst->getOperand(2).getReg();
+ Register SrlAmtReg = SrlInst->getOperand(2).getReg();
+
+ // Case 1: Both shift amounts are constants (may be through COPY instructions)
+ auto ShlConstVal = getIConstantVRegValWithLookThrough(ShlAmtReg, *MRI);
+ auto SrlConstVal = getIConstantVRegValWithLookThrough(SrlAmtReg, *MRI);
+
+ if (ShlConstVal && SrlConstVal) {
+ int64_t ShlVal = ShlConstVal->Value.getSExtValue();
+ int64_t SrlVal = SrlConstVal->Value.getSExtValue();
+
+ if (ShlVal + SrlVal != 32)
+ return false;
+
+ // Create a constant register for the original shift amount (SRL amount)
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ Register ConstAmtReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), ConstAmtReg)
+ .addImm(SrlVal);
+
+ return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtReg);
+ }
+
+ // Helper to extract shift amount from (some_value & 31) pattern
+ auto getShiftAmount = [&](Register ShiftAmtReg) -> std::optional<Register> {
+ MachineInstr *AndInst = getDefIgnoringCopies(ShiftAmtReg, *MRI);
+ if (AndInst && AndInst->getOpcode() == TargetOpcode::G_AND) {
+ Register AndSrc = AndInst->getOperand(1).getReg();
+ Register AndMask = AndInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> MaskVal =
+ getIConstantVRegValWithLookThrough(AndMask, *MRI);
+ if (MaskVal && MaskVal->Value.getZExtValue() == 31) {
+ return AndSrc;
+ }
+ }
+ return std::nullopt;
+ };
+
+ // Case 2: Variable shift amounts - check the AND/XOR pattern
+ auto ShlAmtSrc = getShiftAmount(ShlAmtReg);
+ auto SrlAmtSrc = getShiftAmount(SrlAmtReg);
+
+ if (!ShlAmtSrc || !SrlAmtSrc)
+ return false;
+
+ MachineInstr *ShlSrcInst = getDefIgnoringCopies(*ShlAmtSrc, *MRI);
+ if (!ShlSrcInst)
+ return false;
+
+ Register OriginalAmt;
+ bool IsRotatePattern = false;
+
+ if (ShlSrcInst->getOpcode() == TargetOpcode::G_XOR) {
+ // FSHR pattern: SHL amount = (~original_amt) & 31
+ Register XorSrc = ShlSrcInst->getOperand(1).getReg();
+ Register XorMask = ShlSrcInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> XorMaskVal =
+ getIConstantVRegValWithLookThrough(XorMask, *MRI);
+ if (!XorMaskVal || XorMaskVal->Value.getSExtValue() != -1)
+ return false;
+
+ if (XorSrc != *SrlAmtSrc)
+ return false;
+
+ OriginalAmt = *SrlAmtSrc;
+ IsRotatePattern = false;
+
+ } else if (ShlSrcInst->getOpcode() == TargetOpcode::G_SUB) {
+ // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+ Register SubLHS = ShlSrcInst->getOperand(1).getReg();
+ Register SubRHS = ShlSrcInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> SubLHSVal =
+ getIConstantVRegValWithLookThrough(SubLHS, *MRI);
+ if (!SubLHSVal || SubLHSVal->Value.getZExtValue() != 0)
+ return false;
+
+ if (SubRHS != *SrlAmtSrc)
+ return false;
+
+ OriginalAmt = *SrlAmtSrc;
+ IsRotatePattern = true;
+
+ } else
+ return false;
+
+ // Build V_ALIGNBIT_B32 instruction
+ Register AlignBitSrc0 = ShlSrc;
+ Register AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+ Register VarShiftAmount = OriginalAmt;
+
+ return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, VarShiftAmount);
+}
+
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -4033,6 +4258,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_XOR:
if (selectBITOP3(I))
return true;
+ if (I.getOpcode() == TargetOpcode::G_OR && selectRotateOrFunnelShiftPattern(I))
+ return true;
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..46cdf813330b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,6 +97,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectG_FNEG(MachineInstr &I) const;
bool selectG_FABS(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
+ bool selectRotateOrFunnelShiftPattern(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..b1b19332d870c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2041,13 +2041,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
- getActionDefinitionsBuilder({G_ROTR, G_ROTL})
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder({G_ROTR, G_ROTL}).scalarize(0).lower();
// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
- .legalFor({{S32, S32}})
.lowerFor({{V2S16, V2S16}})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b54cccead9781..a280b84a4667b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4089,6 +4089,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_ROTR:
+ case AMDGPU::G_ROTL: {
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..3e65697c07450 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1768,102 +1768,102 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
; GFX6-LABEL: s_fshl_v2i24:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: s_lshr_b32 s6, s0, 16
-; GFX6-NEXT: s_and_b32 s8, s0, 0xff
-; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_lshr_b32 s7, s1, 8
-; GFX6-NEXT: s_or_b32 s8, s8, s9
+; GFX6-NEXT: s_lshr_b32 s7, s0, 24
+; GFX6-NEXT: s_and_b32 s9, s0, 0xff
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008
+; GFX6-NEXT: s_lshl_b32 s0, s0, 8
+; GFX6-NEXT: s_or_b32 s0, s9, s0
; GFX6-NEXT: s_and_b32 s6, s6, 0xff
+; GFX6-NEXT: s_lshr_b32 s8, s1, 8
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
+; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT: s_or_b32 s0, s0, s6
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: s_and_b32 s6, s8, 0xff
+; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: s_or_b32 s1, s7, s1
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s1, s1, s6
+; GFX6-NEXT: s_lshr_b32 s6, s2, 16
+; GFX6-NEXT: s_lshr_b32 s7, s2, 24
+; GFX6-NEXT: s_and_b32 s9, s2, 0xff
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008
+; GFX6-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NEXT: v_not_b32_e32 v1, 23
+; GFX6-NEXT: s_or_b32 s2, s9, s2
+; GFX6-NEXT: s_and_b32 s6, s6, 0xff
+; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX6-NEXT: s_lshr_b32 s8, s3, 8
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
-; GFX6-NEXT: v_not_b32_e32 v3, 23
-; GFX6-NEXT: s_or_b32 s6, s8, s6
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008
-; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshr_b32 s0, s2, 16
-; GFX6-NEXT: s_and_b32 s7, s2, 0xff
-; GFX6-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NEXT: s_lshr_b32 s1, s3, 8
-; GFX6-NEXT: s_or_b32 s7, s7, s8
-; GFX6-NEXT: s_and_b32 s0, s0, 0xff
; GFX6-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
-; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_or_b32 s0, s7, s0
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
-; GFX6-NEXT: s_lshr_b32 s1, s4, 16
-; GFX6-NEXT: s_and_b32 s3, s4, 0xff
-; GFX6-NEXT: s_lshl_b32 s7, s7, 8
-; GFX6-NEXT: s_or_b32 s3, s3, s7
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_or_b32 s1, s3, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2
-; GFX6-NEXT: s_lshr_b32 s2, s5, 8
-; GFX6-NEXT: s_and_b32 s3, s5, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24
-; GFX6-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
-; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v5, s2, v5
-; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s1, v4
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT: s_or_b32 s2, s2, s6
+; GFX6-NEXT: s_lshl_b32 s3, s3, 8
+; GFX6-NEXT: s_and_b32 s6, s8, 0xff
+; GFX6-NEXT: s_or_b32 s3, s7, s3
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s3, s3, s6
+; GFX6-NEXT: s_lshr_b32 s6, s4, 16
+; GFX6-NEXT: s_lshr_b32 s7, s4, 24
+; GFX6-NEXT: s_and_b32 s9, s4, 0xff
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT: s_lshl_b32 s4, s4, 8
+; GFX6-NEXT: s_or_b32 s4, s9, s4
+; GFX6-NEXT: s_and_b32 s6, s6, 0xff
+; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s4, s4, s6
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s4, v0
+; GFX6-NEXT: s_lshr_b32 s8, s5, 8
+; GFX6-NEXT: s_and_b32 s5, s5, 0xff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 8
; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3
+; GFX6-NEXT: s_and_b32 s6, s8, 0xff
+; GFX6-NEXT: s_or_b32 s5, s7, s5
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s5, s5, s6
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
+; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT: v_lshl_b32_e32 v4, s6, v4
-; GFX6-NEXT: v_lshr_b32_e32 v6, s0, v6
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2
; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8
+; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT: s_lshr_b32 s0, s2, 1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GFX6-NEXT: v_lshr_b32_e32 v3, s0, v3
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GFX6-NEXT: s_lshr_b32 s0, s3, 1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GFX6-NEXT: v_lshl_b32_e32 v0, s1, v0
+; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8
+; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT: v_bfe_u32 v2, v2, 16, 8
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0
@@ -2568,156 +2568,124 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
}
define amdgpu_ps i32 @s_fshl_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
-; GFX6-LABEL: s_fshl_i32:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_not_b32 s1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_not_b32 s1, s2
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: s_not_b32 s1, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s1, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_not_b32 s3, s2
+; GCN-NEXT: s_lshr_b32 s1, s1, 1
+; GCN-NEXT: s_lshl_b32 s0, s0, s2
+; GCN-NEXT: s_lshr_b32 s1, s1, s3
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_lshl_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
ret i32 %result
}
define amdgpu_ps i32 @s_fshl_i32_5(i32 inreg %lhs, i32 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i32_5:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 27
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i32_5:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 27
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i32_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 27
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i32_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 27
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i32_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 5
+; GCN-NEXT: s_lshr_b32 s1, s1, 27
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i32_5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-NEXT: s_lshr_b32 s1, s1, 27
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5)
ret i32 %result
}
define amdgpu_ps i32 @s_fshl_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i32_8:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 24
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i32_8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 24
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i32_8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 24
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i32_8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 24
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i32_8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 8
+; GCN-NEXT: s_lshr_b32 s1, s1, 24
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i32_8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 8
+; GFX11-NEXT: s_lshr_b32 s1, s1, 24
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8)
ret i32 %result
}
define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) {
-; GCN-LABEL: v_fshl_i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_fshl_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 31, v2
+; GFX6-NEXT: v_not_b32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 31, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 31, v2
+; GFX8-NEXT: v_not_b32_e32 v2, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 31, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 31, v2
+; GFX9-NEXT: v_not_b32_e32 v2, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 31, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_not_b32_e32 v3, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 31, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 31, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_not_b32_e32 v2, v2
+; GFX11-NEXT: v_not_b32_e32 v3, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 31, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, v2, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
ret i32 %result
@@ -2758,46 +2726,56 @@ define i32 @v_fshl_i32_8(i32 %lhs, i32 %rhs) {
define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) {
; GFX6-LABEL: v_fshl_i32_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_alignbit_b32 v1, s0, v1, 1
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
+; GFX6-NEXT: v_and_b32_e32 v1, 31, v0
; GFX6-NEXT: v_not_b32_e32 v0, v0
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 31, v0
+; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT: s_lshr_b32 s0, s1, 1
+; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i32_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v1, s0, v1, 1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
+; GFX8-NEXT: v_and_b32_e32 v1, 31, v0
; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 31, v0
+; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT: s_lshr_b32 s0, s1, 1
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i32_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
+; GFX9-NEXT: v_and_b32_e32 v1, 31, v0
; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 31, v0
+; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i32_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v1, s0, s1, 1
-; GFX10-NEXT: v_not_b32_e32 v0, v0
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX10-NEXT: v_not_b32_e32 v1, v0
+; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 31, v1
+; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1
+; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i32_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1
-; GFX11-NEXT: v_not_b32_e32 v0, v0
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX11-NEXT: v_not_b32_e32 v1, v0
+; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 31, v1
+; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
%cast.result = bitcast i32 %result to float
@@ -2807,46 +2785,48 @@ define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt)
define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) {
; GFX6-LABEL: v_fshl_i32_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_not_b32 s1, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: s_andn2_b32 s2, 31, s1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, s1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i32_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s1, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: s_andn2_b32 s2, 31, s1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_lshl_b32 s0, s0, s1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i32_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: s_andn2_b32 s2, 31, s1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_lshl_b32 s0, s0, s1
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i32_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s1, s1
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_andn2_b32 s2, 31, s1
+; GFX10-NEXT: s_lshl_b32 s0, s0, s1
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i32_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_and_not1_b32 s2, 31, s1
+; GFX11-NEXT: s_lshl_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
%cast.result = bitcast i32 %result to float
@@ -2854,51 +2834,25 @@ define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt)
}
define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
-; GFX6-LABEL: v_fshl_i32_vss:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_not_b32 s1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: v_fshl_i32_vss:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_not_b32 s1, s2
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: v_fshl_i32_vss:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: s_not_b32 s1, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: v_fshl_i32_vss:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s1, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: v_fshl_i32_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_not_b32 s3, s2
+; GCN-NEXT: s_lshr_b32 s1, s1, 1
+; GCN-NEXT: s_lshl_b32 s0, s0, s2
+; GCN-NEXT: s_lshr_b32 s1, s1, s3
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i32_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_lshl_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
%cast.result = bitcast i32 %result to float
@@ -2909,67 +2863,92 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
; GFX6-LABEL: v_fshl_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v6, 31, v4
; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX6-NEXT: v_alignbit_b32 v2, v1, v3, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_not_b32_e32 v3, v5
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 31, v5
+; GFX6-NEXT: v_not_b32_e32 v4, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v6, 31, v4
; GFX8-NEXT: v_not_b32_e32 v4, v4
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX8-NEXT: v_alignbit_b32 v2, v1, v3, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_not_b32_e32 v3, v5
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 31, v5
+; GFX8-NEXT: v_not_b32_e32 v4, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v6, 31, v4
; GFX9-NEXT: v_not_b32_e32 v4, v4
-; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX9-NEXT: v_alignbit_b32 v2, v1, v3, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_not_b32_e32 v3, v5
-; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX9-NEXT: v_not_b32_e32 v4, v5
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 31, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_not_b32_e32 v5, v5
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT: v_not_b32_e32 v6, v4
+; GFX10-NEXT: v_not_b32_e32 v7, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX10-NEXT: v_and_b32_e32 v7, 31, v7
+; GFX10-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT: v_not_b32_e32 v6, v4
+; GFX11-NEXT: v_not_b32_e32 v7, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 31, v7
+; GFX11-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
ret <2 x i32> %result
@@ -2979,87 +2958,123 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
; GFX6-LABEL: v_fshl_v3i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_alignbit_b32 v3, v0, v3, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v9, 31, v6
; GFX6-NEXT: v_not_b32_e32 v6, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v1, v4, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_not_b32_e32 v4, v7
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v5, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 31, v7
+; GFX6-NEXT: v_not_b32_e32 v6, v7
+; GFX6-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 31, v8
; GFX6-NEXT: v_not_b32_e32 v4, v8
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v3i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_alignbit_b32 v3, v0, v3, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v9, 31, v6
; GFX8-NEXT: v_not_b32_e32 v6, v6
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX8-NEXT: v_alignbit_b32 v3, v1, v4, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_not_b32_e32 v4, v7
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v4
-; GFX8-NEXT: v_alignbit_b32 v3, v2, v5, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 31, v7
+; GFX8-NEXT: v_not_b32_e32 v6, v7
+; GFX8-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 31, v8
; GFX8-NEXT: v_not_b32_e32 v4, v8
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v3, v0, v3, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v9, 31, v6
; GFX9-NEXT: v_not_b32_e32 v6, v6
-; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX9-NEXT: v_alignbit_b32 v3, v1, v4, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_not_b32_e32 v4, v7
-; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v4
-; GFX9-NEXT: v_alignbit_b32 v3, v2, v5, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX9-NEXT: v_not_b32_e32 v6, v7
+; GFX9-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v9, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 31, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, v6, v4
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, v3, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
-; GFX9-NEXT: v_alignbit_b32 v2, v2, v3, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v3, 31, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, v4, v5
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v3i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_not_b32_e32 v6, v6
-; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_not_b32_e32 v7, v7
-; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_not_b32_e32 v8, v8
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7
-; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT: v_not_b32_e32 v9, v6
+; GFX10-NEXT: v_not_b32_e32 v10, v7
+; GFX10-NEXT: v_not_b32_e32 v11, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v9, 31, v9
+; GFX10-NEXT: v_and_b32_e32 v10, 31, v10
+; GFX10-NEXT: v_and_b32_e32 v11, 31, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v9, v3
+; GFX10-NEXT: v_and_b32_e32 v7, 31, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v10, v4
+; GFX10-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v11, v5
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v7, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, v8, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, v0, v3, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_not_b32_e32 v6, v6
-; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_not_b32_e32 v7, v7
-; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_not_b32_e32 v8, v8
-; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6
-; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX11-NEXT: v_not_b32_e32 v9, v6
+; GFX11-NEXT: v_not_b32_e32 v10, v7
+; GFX11-NEXT: v_not_b32_e32 v11, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v9, 31, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 31, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 31, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 31, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v9, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 31, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v10, v4
+; GFX11-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v11, v5
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, v7, v4
+; GFX11-NEXT: v_lshl_or_b32 v2, v2, v8, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
ret <3 x i32> %result
@@ -3069,107 +3084,155 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
; GFX6-LABEL: v_fshl_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_alignbit_b32 v4, v0, v4, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v12, 31, v8
; GFX6-NEXT: v_not_b32_e32 v8, v8
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX6-NEXT: v_alignbit_b32 v4, v1, v5, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_not_b32_e32 v5, v9
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v2, v6, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v9
+; GFX6-NEXT: v_not_b32_e32 v8, v9
+; GFX6-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v10
; GFX6-NEXT: v_not_b32_e32 v5, v10
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v3, v7, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 31, v11
; GFX6-NEXT: v_not_b32_e32 v5, v11
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v7
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v4i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_alignbit_b32 v4, v0, v4, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v12, 31, v8
; GFX8-NEXT: v_not_b32_e32 v8, v8
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX8-NEXT: v_alignbit_b32 v4, v1, v5, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_not_b32_e32 v5, v9
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v2, v6, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v9
+; GFX8-NEXT: v_not_b32_e32 v8, v9
+; GFX8-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v10
; GFX8-NEXT: v_not_b32_e32 v5, v10
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v3, v7, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 31, v11
; GFX8-NEXT: v_not_b32_e32 v5, v11
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, v5, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v4, v0, v4, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v12, 31, v8
; GFX9-NEXT: v_not_b32_e32 v8, v8
-; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX9-NEXT: v_alignbit_b32 v4, v1, v5, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_not_b32_e32 v5, v9
-; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v5
-; GFX9-NEXT: v_alignbit_b32 v4, v2, v6, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, v8, v4
+; GFX9-NEXT: v_not_b32_e32 v8, v9
+; GFX9-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v12, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, v8, v5
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v5
; GFX9-NEXT: v_not_b32_e32 v5, v10
-; GFX9-NEXT: v_alignbit_b32 v2, v2, v4, v5
-; GFX9-NEXT: v_alignbit_b32 v4, v3, v7, 1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, v5, v6
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, v4, v5
; GFX9-NEXT: v_not_b32_e32 v5, v11
-; GFX9-NEXT: v_alignbit_b32 v3, v3, v4, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 31, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v4, 31, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, v5, v6
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, v4, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_not_b32_e32 v8, v8
-; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_not_b32_e32 v9, v9
-; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_not_b32_e32 v10, v10
-; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_not_b32_e32 v11, v11
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
-; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
-; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX10-NEXT: v_not_b32_e32 v12, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_not_b32_e32 v13, v9
+; GFX10-NEXT: v_not_b32_e32 v14, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_and_b32_e32 v12, 31, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v13, 31, v13
+; GFX10-NEXT: v_and_b32_e32 v14, 31, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, v12, v4
+; GFX10-NEXT: v_not_b32_e32 v12, v10
+; GFX10-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX10-NEXT: v_and_b32_e32 v9, 31, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v13, v5
+; GFX10-NEXT: v_and_b32_e32 v10, 31, v10
+; GFX10-NEXT: v_and_b32_e32 v12, 31, v12
+; GFX10-NEXT: v_and_b32_e32 v11, 31, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, v14, v7
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v8, v4
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v9, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, v12, v6
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, v11, v7
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, v10, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_not_b32_e32 v8, v8
-; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_not_b32_e32 v9, v9
-; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_not_b32_e32 v10, v10
-; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX11-NEXT: v_not_b32_e32 v11, v11
-; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8
-; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9
-; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX11-NEXT: v_not_b32_e32 v12, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_not_b32_e32 v13, v9
+; GFX11-NEXT: v_not_b32_e32 v14, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_and_b32_e32 v12, 31, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 31, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 31, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 1, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v12, v4
+; GFX11-NEXT: v_not_b32_e32 v12, v10
+; GFX11-NEXT: v_and_b32_e32 v8, 31, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 31, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v13, v5
+; GFX11-NEXT: v_and_b32_e32 v10, 31, v10
+; GFX11-NEXT: v_and_b32_e32 v12, 31, v12
+; GFX11-NEXT: v_and_b32_e32 v11, 31, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, v14, v7
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, v8, v4
+; GFX11-NEXT: v_lshl_or_b32 v1, v1, v9, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, v12, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v3, v3, v11, v7
+; GFX11-NEXT: v_lshl_or_b32 v2, v2, v10, v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
ret <4 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 238cc06fc7f7c..e4a07748cf929 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -1783,102 +1783,102 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
; GFX6-LABEL: s_fshr_v2i24:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
-; GFX6-NEXT: v_not_b32_e32 v3, 23
-; GFX6-NEXT: s_lshr_b32 s7, s1, 8
-; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: s_and_b32 s8, s0, 0xff
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008
-; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
-; GFX6-NEXT: s_or_b32 s8, s8, s9
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshr_b32 s1, s2, 16
-; GFX6-NEXT: s_and_b32 s9, s2, 0xff
-; GFX6-NEXT: s_lshl_b32 s10, s10, 8
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: s_lshr_b32 s6, s0, 16
-; GFX6-NEXT: s_and_b32 s0, s7, 0xff
-; GFX6-NEXT: s_lshr_b32 s7, s3, 8
-; GFX6-NEXT: s_or_b32 s9, s9, s10
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
+; GFX6-NEXT: s_lshr_b32 s7, s0, 24
+; GFX6-NEXT: s_and_b32 s9, s0, 0xff
+; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008
+; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, 8
+; GFX6-NEXT: s_or_b32 s0, s9, s0
+; GFX6-NEXT: s_lshr_b32 s9, s2, 16
+; GFX6-NEXT: s_lshr_b32 s10, s2, 24
+; GFX6-NEXT: s_and_b32 s12, s2, 0xff
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008
+; GFX6-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NEXT: v_not_b32_e32 v1, 23
+; GFX6-NEXT: s_or_b32 s2, s12, s2
+; GFX6-NEXT: s_and_b32 s9, s9, 0xff
+; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1
+; GFX6-NEXT: s_lshr_b32 s11, s3, 8
+; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX6-NEXT: s_lshl_b32 s9, s9, 16
; GFX6-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s2, s7, 0xff
-; GFX6-NEXT: s_or_b32 s1, s9, s1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008
-; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
-; GFX6-NEXT: v_or_b32_e32 v1, s2, v1
-; GFX6-NEXT: s_lshr_b32 s2, s4, 16
-; GFX6-NEXT: s_and_b32 s7, s4, 0xff
-; GFX6-NEXT: s_lshl_b32 s9, s9, 8
-; GFX6-NEXT: s_or_b32 s7, s7, s9
-; GFX6-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
-; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: s_or_b32 s2, s7, s2
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2
-; GFX6-NEXT: s_lshr_b32 s3, s5, 8
+; GFX6-NEXT: s_or_b32 s2, s2, s9
+; GFX6-NEXT: s_lshl_b32 s3, s3, 8
+; GFX6-NEXT: s_and_b32 s9, s11, 0xff
+; GFX6-NEXT: s_or_b32 s3, s10, s3
+; GFX6-NEXT: s_lshl_b32 s9, s9, 16
+; GFX6-NEXT: s_or_b32 s3, s3, s9
+; GFX6-NEXT: s_lshr_b32 s9, s4, 16
+; GFX6-NEXT: s_lshr_b32 s10, s4, 24
+; GFX6-NEXT: s_and_b32 s12, s4, 0xff
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT: s_lshl_b32 s4, s4, 8
+; GFX6-NEXT: s_or_b32 s4, s12, s4
+; GFX6-NEXT: s_and_b32 s9, s9, 0xff
+; GFX6-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX6-NEXT: s_lshl_b32 s9, s9, 16
+; GFX6-NEXT: s_or_b32 s4, s4, s9
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s4, v0
+; GFX6-NEXT: s_lshr_b32 s11, s5, 8
; GFX6-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24
-; GFX6-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: v_or_b32_e32 v5, s3, v5
-; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT: s_lshl_b32 s5, s5, 8
; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3
+; GFX6-NEXT: s_and_b32 s9, s11, 0xff
+; GFX6-NEXT: s_or_b32 s5, s10, s5
+; GFX6-NEXT: s_lshl_b32 s9, s9, 16
+; GFX6-NEXT: s_or_b32 s5, s5, s9
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
+; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3
-; GFX6-NEXT: s_lshl_b32 s2, s6, 17
-; GFX6-NEXT: s_lshl_b32 s3, s8, 1
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT: s_and_b32 s6, s6, 0xff
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX6-NEXT: v_lshl_b32_e32 v6, s2, v6
-; GFX6-NEXT: v_lshr_b32_e32 v4, s1, v4
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2
-; GFX6-NEXT: s_lshl_b32 s0, s0, 17
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_lshl_b32 s4, s6, 17
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_or_b32 s0, s4, s0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8
+; GFX6-NEXT: v_lshl_b32_e32 v3, s0, v3
+; GFX6-NEXT: v_lshr_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
+; GFX6-NEXT: s_lshr_b32 s8, s1, 8
+; GFX6-NEXT: s_and_b32 s1, s1, 0xff
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: s_and_b32 s8, s8, 0xff
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT: s_lshl_b32 s1, s1, 9
+; GFX6-NEXT: s_lshl_b32 s2, s7, 1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0
+; GFX6-NEXT: s_lshl_b32 s0, s8, 17
+; GFX6-NEXT: s_or_b32 s1, s1, s2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_lshr_b32_e32 v0, s3, v0
+; GFX6-NEXT: v_bfe_u32 v3, v2, 8, 8
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT: v_bfe_u32 v2, v2, 16, 8
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0
@@ -2592,117 +2592,86 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
define amdgpu_ps i32 @s_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
; GFX6-LABEL: s_fshr_i32:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_not_b32 s3, s2
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_lshl_b32 s0, s0, s3
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_not_b32 s3, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s3
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_not_b32 s3, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s3
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_not_b32 s3, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
ret i32 %result
}
define amdgpu_ps i32 @s_fshr_i32_5(i32 inreg %lhs, i32 inreg %rhs) {
-; GFX6-LABEL: s_fshr_i32_5:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 5
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshr_i32_5:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 5
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshr_i32_5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 5
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshr_i32_5:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshr_i32_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 27
+; GCN-NEXT: s_lshr_b32 s1, s1, 5
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i32_5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 27
+; GFX11-NEXT: s_lshr_b32 s1, s1, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
ret i32 %result
}
define amdgpu_ps i32 @s_fshr_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
-; GFX6-LABEL: s_fshr_i32_8:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 8
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshr_i32_8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 8
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshr_i32_8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 8
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshr_i32_8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 8
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshr_i32_8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 24
+; GCN-NEXT: s_lshr_b32 s1, s1, 8
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i32_8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 24
+; GFX11-NEXT: s_lshr_b32 s1, s1, 8
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
ret i32 %result
@@ -2792,20 +2761,20 @@ define amdgpu_ps float @v_fshr_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt)
define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) {
; GFX6-LABEL: v_fshr_i32_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i32_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i32_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i32_svs:
@@ -2825,36 +2794,53 @@ define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt)
define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
; GFX6-LABEL: v_fshr_i32_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: s_not_b32 s3, s2
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_lshl_b32 s0, s0, s3
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i32_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: s_not_b32 s3, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s3
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i32_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: s_not_b32 s3, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s3
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i32_vss:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT: s_not_b32 s3, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i32_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: ; return to shader part epilog
%result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
%cast.result = bitcast i32 %result to float
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
deleted file mode 100644
index 0a4cb3ccf2957..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
+++ /dev/null
@@ -1,41 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
-
----
-
-name: fshr_s32
-legalized: true
-regBankSelected: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2
-
- ; GCN-LABEL: name: fshr_s32
- ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
- ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
- ;
- ; GFX11-LABEL: name: fshr_s32
- ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_fake16_e64_]]
- %0:vgpr(s32) = COPY $vgpr0
- %1:vgpr(s32) = COPY $vgpr1
- %2:vgpr(s32) = COPY $vgpr2
- %3:vgpr(s32) = G_FSHR %0, %1, %2
- S_ENDPGM 0, implicit %3
-
-...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir
index 240036207bd0d..d3d5b243ca766 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir
@@ -15,13 +15,17 @@ body: |
; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
- ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; SI-NEXT: $vgpr0 = COPY [[FSHR1]](s32)
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; SI-NEXT: $vgpr0 = COPY [[OR]](s32)
;
; VI-LABEL: name: test_fshl_s32_s32
; VI: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -29,13 +33,17 @@ body: |
; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
- ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; VI-NEXT: $vgpr0 = COPY [[FSHR1]](s32)
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
;
; GFX9-LABEL: name: test_fshl_s32_s32
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -43,13 +51,17 @@ body: |
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
- ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[FSHR1]](s32)
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -72,17 +84,24 @@ body: |
; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; SI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; SI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
- ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; SI-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
- ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
; SI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
- ; SI-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+ ; SI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32)
+ ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32)
+ ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32)
+ ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
;
; VI-LABEL: name: test_fshl_v2s32_v2s32
@@ -94,17 +113,24 @@ body: |
; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; VI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; VI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
- ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; VI-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
- ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
; VI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
- ; VI-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
- ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32)
+ ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32)
+ ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32)
+ ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
;
; GFX9-LABEL: name: test_fshl_v2s32_v2s32
@@ -116,17 +142,24 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
- ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32)
- ; GFX9-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32)
+ ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]]
+ ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
- ; GFX9-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32)
+ ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32)
+ ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32)
+ ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
index 0a15cc3824ae7..56969c316e697 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
@@ -17,8 +17,17 @@ body: |
; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32)
- ; SI-NEXT: $vgpr0 = COPY [[FSHR]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+ ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; SI-NEXT: $vgpr0 = COPY [[OR]](s32)
;
; VI-LABEL: name: test_fshr_s32_s32
; VI: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -26,8 +35,17 @@ body: |
; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32)
- ; VI-NEXT: $vgpr0 = COPY [[FSHR]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
;
; GFX9-LABEL: name: test_fshr_s32_s32
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -35,8 +53,17 @@ body: |
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[FSHR]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32)
+ ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -59,9 +86,24 @@ body: |
; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; SI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; SI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32)
- ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32)
- ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
+ ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32)
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
+ ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+ ; SI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32)
+ ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32)
+ ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]]
+ ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
;
; VI-LABEL: name: test_fshr_v2s32_v2s32
@@ -73,9 +115,24 @@ body: |
; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; VI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; VI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32)
- ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32)
- ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32)
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
+ ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32)
+ ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32)
+ ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]]
+ ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
;
; GFX9-LABEL: name: test_fshr_v2s32_v2s32
@@ -87,9 +144,24 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>)
- ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32)
- ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]]
+ ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32)
+ ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]]
+ ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]]
+ ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]]
+ ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]]
+ ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32)
+ ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32)
+ ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
index 7fdee12315754..f4b6727c82e99 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
@@ -180,9 +180,14 @@ body: |
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
+ ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32)
+ ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
+ ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[AND1]](s32)
+ ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LSHR]]
+ ; GFX-NEXT: $sgpr0 = COPY [[OR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -300,15 +305,32 @@ body: |
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32)
+ ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C1]]
+ ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32)
+ ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
+ ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[AND1]](s32)
+ ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LSHR]]
; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32)
+ ; GFX-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C1]]
+ ; GFX-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32)
+ ; GFX-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
+ ; GFX-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[AND3]](s32)
+ ; GFX-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[LSHR1]]
; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32)
+ ; GFX-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV6]], [[C1]]
+ ; GFX-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV2]], [[AND4]](s32)
+ ; GFX-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]]
+ ; GFX-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND5]](s32)
+ ; GFX-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[LSHR2]]
; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[UV7]], [[C1]]
+ ; GFX-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[UV3]], [[AND6]](s32)
+ ; GFX-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]]
+ ; GFX-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND7]](s32)
+ ; GFX-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[LSHR3]]
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
@@ -391,8 +413,15 @@ body: |
; GFX-NEXT: {{ $}}
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
+ ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
+ ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[AND]](s32)
+ ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
+ ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND1]](s32)
+ ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+ ; GFX-NEXT: $sgpr0 = COPY [[OR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTR %0, %1(s32)
@@ -452,11 +481,33 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32)
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32)
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32)
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
+ ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C1]]
+ ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[AND]](s32)
+ ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
+ ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND1]](s32)
+ ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+ ; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
+ ; GFX-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C1]]
+ ; GFX-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[AND2]](s32)
+ ; GFX-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
+ ; GFX-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND3]](s32)
+ ; GFX-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]]
+ ; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
+ ; GFX-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV6]], [[C1]]
+ ; GFX-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND4]](s32)
+ ; GFX-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]]
+ ; GFX-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV2]], [[AND5]](s32)
+ ; GFX-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[SHL2]]
+ ; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
+ ; GFX-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[UV7]], [[C1]]
+ ; GFX-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND6]](s32)
+ ; GFX-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]]
+ ; GFX-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[UV3]], [[AND7]](s32)
+ ; GFX-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[SHL3]]
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir
deleted file mode 100644
index b1a55fe7bc42f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir
+++ /dev/null
@@ -1,168 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
-
----
-name: fshr_sss
-legalized: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $sgpr2
- ; CHECK-LABEL: name: fshr_sss
- ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY4]], [[COPY5]](s32)
- %0:_(s32) = COPY $sgpr0
- %1:_(s32) = COPY $sgpr1
- %2:_(s32) = COPY $sgpr2
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_vss
-legalized: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $sgpr0, $sgpr1
- ; CHECK-LABEL: name: fshr_vss
- ; CHECK: liveins: $vgpr0, $sgpr0, $sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY3]], [[COPY4]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $sgpr0
- %2:_(s32) = COPY $sgpr1
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_svs
-legalized: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $vgpr0, $sgpr1
- ; CHECK-LABEL: name: fshr_svs
- ; CHECK: liveins: $sgpr0, $vgpr0, $sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY1]], [[COPY4]](s32)
- %0:_(s32) = COPY $sgpr0
- %1:_(s32) = COPY $vgpr0
- %2:_(s32) = COPY $sgpr1
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_ssv
-legalized: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $vgpr0
- ; CHECK-LABEL: name: fshr_ssv
- ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY4]], [[COPY2]](s32)
- %0:_(s32) = COPY $sgpr0
- %1:_(s32) = COPY $sgpr1
- %2:_(s32) = COPY $vgpr0
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_vvs
-legalized: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $sgpr0
- ; CHECK-LABEL: name: fshr_vvs
- ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY3]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $sgpr0
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_vsv
-legalized: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $sgpr0, $vgpr1
- ; CHECK-LABEL: name: fshr_vsv
- ; CHECK: liveins: $vgpr0, $sgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY3]], [[COPY2]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $sgpr1
- %2:_(s32) = COPY $vgpr1
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_svv
-legalized: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fshr_svv
- ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY1]], [[COPY2]](s32)
- %0:_(s32) = COPY $sgpr0
- %1:_(s32) = COPY $vgpr0
- %2:_(s32) = COPY $vgpr1
- %3:_(s32) = G_FSHR %0, %1, %2
-...
----
-name: fshr_vvv
-legalized: true
-
-body: |
- bb.0:
- liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: fshr_vvv
- ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32)
- %0:_(s32) = COPY $vgpr0
- %1:_(s32) = COPY $vgpr1
- %2:_(s32) = COPY $vgpr2
- %3:_(s32) = G_FSHR %0, %1, %2
-...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index d9158e3558395..acd3dc683fa86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -376,9 +376,9 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX6-NEXT: s_min_u32 s2, s3, s2
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, 24
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v2i8:
@@ -726,18 +726,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_not_b32 s5, s3
-; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s5, s4
+; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v4i8:
@@ -2142,9 +2142,9 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX6-NEXT: s_min_u32 s2, s3, s2
; GFX6-NEXT: s_add_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v2i16:
@@ -2349,15 +2349,15 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX6-NEXT: s_lshl_b32 s4, s7, 16
; GFX6-NEXT: s_not_b32 s5, s3
; GFX6-NEXT: s_min_u32 s4, s5, s4
-; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_add_i32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v4i16:
@@ -2522,20 +2522,20 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX6-NEXT: s_add_i32 s4, s4, s6
; GFX6-NEXT: s_lshl_b32 s6, s11, 16
; GFX6-NEXT: s_not_b32 s7, s5
-; GFX6-NEXT: s_min_u32 s6, s7, s6
-; GFX6-NEXT: s_add_i32 s5, s5, s6
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_min_u32 s6, s7, s6
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: s_add_i32 s5, s5, s6
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_lshr_b32 s4, s4, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v6i16:
@@ -2730,24 +2730,24 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX6-NEXT: s_add_i32 s6, s6, s8
; GFX6-NEXT: s_lshl_b32 s8, s15, 16
; GFX6-NEXT: s_not_b32 s9, s7
-; GFX6-NEXT: s_min_u32 s8, s9, s8
-; GFX6-NEXT: s_add_i32 s7, s7, s8
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_min_u32 s8, s9, s8
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: s_add_i32 s7, s7, s8
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_lshr_b32 s7, s7, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s6
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
-; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_lshr_b32 s4, s4, 16
+; GFX6-NEXT: s_lshr_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_lshl_b32 s3, s7, 16
+; GFX6-NEXT: s_or_b32 s2, s4, s2
+; GFX6-NEXT: s_or_b32 s3, s6, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1fd139b06417f..a7f3bc016948a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -368,9 +368,9 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, 24
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v2i8:
@@ -710,18 +710,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
-; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s3, s4
+; GFX6-NEXT: s_lshr_b32 s0, s0, 24
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_sub_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
; GFX6-NEXT: s_lshr_b32 s3, s3, 24
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v4i8:
@@ -2052,9 +2052,9 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v2i16:
@@ -2247,15 +2247,15 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_lshl_b32 s4, s7, 16
; GFX6-NEXT: s_min_u32 s4, s3, s4
-; GFX6-NEXT: s_sub_i32 s3, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_sub_i32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v4i16:
@@ -2408,20 +2408,20 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
; GFX6-NEXT: s_sub_i32 s4, s4, s6
; GFX6-NEXT: s_lshl_b32 s5, s5, 16
; GFX6-NEXT: s_lshl_b32 s6, s11, 16
-; GFX6-NEXT: s_min_u32 s6, s5, s6
-; GFX6-NEXT: s_sub_i32 s5, s5, s6
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_min_u32 s6, s5, s6
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: s_sub_i32 s5, s5, s6
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_lshr_b32 s4, s4, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_or_b32 s2, s4, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v6i16:
@@ -2600,24 +2600,24 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
; GFX6-NEXT: s_sub_i32 s6, s6, s8
; GFX6-NEXT: s_lshl_b32 s7, s7, 16
; GFX6-NEXT: s_lshl_b32 s8, s15, 16
-; GFX6-NEXT: s_min_u32 s8, s7, s8
-; GFX6-NEXT: s_sub_i32 s7, s7, s8
; GFX6-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-NEXT: s_min_u32 s8, s7, s8
+; GFX6-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-NEXT: s_lshr_b32 s3, s3, 16
+; GFX6-NEXT: s_sub_i32 s7, s7, s8
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
; GFX6-NEXT: s_lshr_b32 s5, s5, 16
; GFX6-NEXT: s_lshr_b32 s7, s7, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s6
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16
-; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 16
+; GFX6-NEXT: s_lshr_b32 s4, s4, 16
+; GFX6-NEXT: s_lshr_b32 s6, s6, 16
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_lshl_b32 s3, s7, 16
+; GFX6-NEXT: s_or_b32 s2, s4, s2
+; GFX6-NEXT: s_or_b32 s3, s6, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538215f18..17924629fef84 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -22563,36 +22563,36 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -22668,118 +22668,118 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -22822,7 +22822,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
@@ -22831,62 +22831,62 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
@@ -22906,165 +22906,180 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: .LBB16_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23072,10 +23087,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23083,10 +23099,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23094,10 +23111,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23105,10 +23123,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23116,10 +23135,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23127,10 +23147,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23138,10 +23159,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23149,10 +23171,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23160,10 +23183,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23171,10 +23195,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23182,10 +23207,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23193,10 +23219,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23204,10 +23231,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23215,10 +23243,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23226,19 +23255,21 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23247,10 +23278,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -23509,138 +23541,138 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s17, 0xffff0000
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v21, s4, 1
-; SI-NEXT: s_lshl_b32 s4, s17, 16
; SI-NEXT: v_writelane_b32 v21, s4, 0
+; SI-NEXT: s_lshl_b32 s4, s17, 16
+; SI-NEXT: v_writelane_b32 v21, s4, 1
; SI-NEXT: s_and_b32 s4, s16, 0xffff0000
; SI-NEXT: v_writelane_b32 v21, s4, 2
; SI-NEXT: s_lshl_b32 s4, s16, 16
-; SI-NEXT: s_and_b32 s11, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s10, s9, 16
-; SI-NEXT: s_and_b32 s13, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s12, s8, 16
-; SI-NEXT: s_and_b32 s15, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s7, 16
-; SI-NEXT: s_and_b32 s41, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s40, s6, 16
-; SI-NEXT: s_and_b32 s43, s99, 0xffff0000
-; SI-NEXT: s_lshl_b32 s42, s99, 16
-; SI-NEXT: s_and_b32 s45, s98, 0xffff0000
-; SI-NEXT: s_lshl_b32 s44, s98, 16
-; SI-NEXT: s_and_b32 s47, s97, 0xffff0000
-; SI-NEXT: s_lshl_b32 s46, s97, 16
-; SI-NEXT: s_and_b32 s57, s96, 0xffff0000
-; SI-NEXT: s_lshl_b32 s56, s96, 16
-; SI-NEXT: s_and_b32 s59, s87, 0xffff0000
-; SI-NEXT: s_lshl_b32 s58, s87, 16
-; SI-NEXT: s_and_b32 s61, s86, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s86, 16
-; SI-NEXT: s_and_b32 s63, s85, 0xffff0000
-; SI-NEXT: s_lshl_b32 s62, s85, 16
-; SI-NEXT: s_and_b32 s73, s84, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s84, 16
-; SI-NEXT: s_and_b32 s75, s83, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s83, 16
-; SI-NEXT: s_and_b32 s77, s82, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s82, 16
-; SI-NEXT: s_and_b32 s79, s81, 0xffff0000
-; SI-NEXT: s_lshl_b32 s78, s81, 16
-; SI-NEXT: s_and_b32 s89, s80, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s80, 16
-; SI-NEXT: s_and_b32 s91, s71, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s71, 16
-; SI-NEXT: s_and_b32 s93, s70, 0xffff0000
-; SI-NEXT: s_lshl_b32 s92, s70, 16
-; SI-NEXT: s_and_b32 s95, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s94, s29, 16
-; SI-NEXT: s_and_b32 s31, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s30, s28, 16
-; SI-NEXT: s_and_b32 s35, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s34, s27, 16
-; SI-NEXT: s_and_b32 s37, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s36, s26, 16
-; SI-NEXT: s_and_b32 s39, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s38, s25, 16
-; SI-NEXT: s_and_b32 s49, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s48, s24, 16
-; SI-NEXT: s_and_b32 s51, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s50, s23, 16
-; SI-NEXT: s_and_b32 s53, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s52, s22, 16
-; SI-NEXT: s_and_b32 s55, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s54, s21, 16
-; SI-NEXT: s_and_b32 s65, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s64, s20, 16
-; SI-NEXT: s_and_b32 s67, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s66, s19, 16
-; SI-NEXT: s_and_b32 s69, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s68, s18, 16
+; SI-NEXT: s_and_b32 s10, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s9, 16
+; SI-NEXT: s_and_b32 s12, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s8, 16
+; SI-NEXT: s_and_b32 s14, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s7, 16
+; SI-NEXT: s_and_b32 s40, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s6, 16
+; SI-NEXT: s_and_b32 s42, s99, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s99, 16
+; SI-NEXT: s_and_b32 s44, s98, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s98, 16
+; SI-NEXT: s_and_b32 s46, s97, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s97, 16
+; SI-NEXT: s_and_b32 s56, s96, 0xffff0000
+; SI-NEXT: s_lshl_b32 s57, s96, 16
+; SI-NEXT: s_and_b32 s58, s87, 0xffff0000
+; SI-NEXT: s_lshl_b32 s59, s87, 16
+; SI-NEXT: s_and_b32 s60, s86, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s86, 16
+; SI-NEXT: s_and_b32 s62, s85, 0xffff0000
+; SI-NEXT: s_lshl_b32 s63, s85, 16
+; SI-NEXT: s_and_b32 s72, s84, 0xffff0000
+; SI-NEXT: s_lshl_b32 s73, s84, 16
+; SI-NEXT: s_and_b32 s74, s83, 0xffff0000
+; SI-NEXT: s_lshl_b32 s75, s83, 16
+; SI-NEXT: s_and_b32 s76, s82, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s82, 16
+; SI-NEXT: s_and_b32 s78, s81, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s81, 16
+; SI-NEXT: s_and_b32 s88, s80, 0xffff0000
+; SI-NEXT: s_lshl_b32 s89, s80, 16
+; SI-NEXT: s_and_b32 s90, s71, 0xffff0000
+; SI-NEXT: s_lshl_b32 s91, s71, 16
+; SI-NEXT: s_and_b32 s92, s70, 0xffff0000
+; SI-NEXT: s_lshl_b32 s93, s70, 16
+; SI-NEXT: s_and_b32 s94, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s29, 16
+; SI-NEXT: s_and_b32 s30, s28, 0xffff0000
+; SI-NEXT: s_lshl_b32 s31, s28, 16
+; SI-NEXT: s_and_b32 s34, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s27, 16
+; SI-NEXT: s_and_b32 s36, s26, 0xffff0000
+; SI-NEXT: s_lshl_b32 s37, s26, 16
+; SI-NEXT: s_and_b32 s38, s25, 0xffff0000
+; SI-NEXT: s_lshl_b32 s39, s25, 16
+; SI-NEXT: s_and_b32 s48, s24, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s24, 16
+; SI-NEXT: s_and_b32 s50, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s51, s23, 16
+; SI-NEXT: s_and_b32 s52, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s53, s22, 16
+; SI-NEXT: s_and_b32 s54, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s55, s21, 16
+; SI-NEXT: s_and_b32 s64, s20, 0xffff0000
+; SI-NEXT: s_lshl_b32 s65, s20, 16
+; SI-NEXT: s_and_b32 s66, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s67, s19, 16
+; SI-NEXT: s_and_b32 s68, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s69, s18, 16
; SI-NEXT: v_writelane_b32 v21, s4, 3
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB17_3
; SI-NEXT: .LBB17_2:
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr68
; SI-NEXT: ; implicit-def: $sgpr69
-; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr68
; SI-NEXT: ; implicit-def: $sgpr67
-; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr55
-; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr52
; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr38
; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr36
; SI-NEXT: ; implicit-def: $sgpr35
-; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr34
; SI-NEXT: ; implicit-def: $sgpr31
-; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr30
; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr92
; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr90
; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr76
; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr72
; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr62
; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr47
-; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $sgpr45
-; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr43
-; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr41
-; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $sgpr15
-; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr13
-; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr11
+; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
@@ -23677,8 +23709,11 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: s_mov_b32 s72, s74
; SI-NEXT: s_mov_b32 s73, s75
; SI-NEXT: s_mov_b32 s74, s76
-; SI-NEXT: v_readlane_b32 s75, v21, 0
-; SI-NEXT: v_readlane_b32 s76, v21, 1
+; SI-NEXT: s_mov_b32 s75, s77
+; SI-NEXT: s_mov_b32 s76, s78
+; SI-NEXT: s_mov_b32 s77, s79
+; SI-NEXT: v_readlane_b32 s78, v21, 0
+; SI-NEXT: v_readlane_b32 s79, v21, 1
; SI-NEXT: s_cbranch_vccnz .LBB17_5
; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: s_add_i32 s16, s16, 3
@@ -23713,296 +23748,328 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a
; SI-NEXT: s_add_i32 s7, s7, 3
; SI-NEXT: s_add_i32 s8, s8, 3
; SI-NEXT: s_add_i32 s9, s9, 3
-; SI-NEXT: s_and_b32 s15, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s6, 16
+; SI-NEXT: s_and_b32 s14, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s6, 16
; SI-NEXT: s_and_b32 s6, s16, 0xffff0000
-; SI-NEXT: s_and_b32 s5, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s4, s9, 16
-; SI-NEXT: s_and_b32 s11, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s10, s8, 16
-; SI-NEXT: s_and_b32 s13, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s12, s7, 16
-; SI-NEXT: s_and_b32 s41, s99, 0xffff0000
-; SI-NEXT: s_lshl_b32 s40, s99, 16
-; SI-NEXT: s_and_b32 s43, s98, 0xffff0000
-; SI-NEXT: s_lshl_b32 s42, s98, 16
-; SI-NEXT: s_and_b32 s45, s97, 0xffff0000
-; SI-NEXT: s_lshl_b32 s44, s97, 16
-; SI-NEXT: s_and_b32 s47, s96, 0xffff0000
-; SI-NEXT: s_lshl_b32 s46, s96, 16
-; SI-NEXT: s_and_b32 s57, s87, 0xffff0000
-; SI-NEXT: s_lshl_b32 s56, s87, 16
-; SI-NEXT: s_and_b32 s59, s86, 0xffff0000
-; SI-NEXT: s_lshl_b32 s58, s86, 16
-; SI-NEXT: s_and_b32 s61, s85, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s85, 16
-; SI-NEXT: s_and_b32 s63, s84, 0xffff0000
-; SI-NEXT: s_lshl_b32 s62, s84, 16
-; SI-NEXT: s_and_b32 s73, s83, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s83, 16
-; SI-NEXT: s_and_b32 s77, s82, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s82, 16
-; SI-NEXT: s_and_b32 s79, s81, 0xffff0000
-; SI-NEXT: s_lshl_b32 s78, s81, 16
-; SI-NEXT: s_and_b32 s89, s80, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s80, 16
-; SI-NEXT: s_and_b32 s91, s71, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s71, 16
-; SI-NEXT: s_and_b32 s93, s70, 0xffff0000
-; SI-NEXT: s_lshl_b32 s92, s70, 16
-; SI-NEXT: s_and_b32 s95, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s94, s29, 16
-; SI-NEXT: s_and_b32 s31, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s30, s28, 16
-; SI-NEXT: s_and_b32 s35, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s34, s27, 16
-; SI-NEXT: s_and_b32 s37, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s36, s26, 16
-; SI-NEXT: s_and_b32 s39, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s38, s25, 16
-; SI-NEXT: s_and_b32 s49, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s48, s24, 16
-; SI-NEXT: s_and_b32 s51, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s50, s23, 16
-; SI-NEXT: s_and_b32 s53, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s52, s22, 16
-; SI-NEXT: s_and_b32 s55, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s54, s21, 16
-; SI-NEXT: s_and_b32 s65, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s64, s20, 16
-; SI-NEXT: s_and_b32 s67, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s66, s19, 16
-; SI-NEXT: s_and_b32 s69, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s68, s18, 16
-; SI-NEXT: s_and_b32 s76, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s17, 16
+; SI-NEXT: s_and_b32 s4, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s5, s9, 16
+; SI-NEXT: s_and_b32 s10, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s8, 16
+; SI-NEXT: s_and_b32 s12, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s7, 16
+; SI-NEXT: s_and_b32 s40, s99, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s99, 16
+; SI-NEXT: s_and_b32 s42, s98, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s98, 16
+; SI-NEXT: s_and_b32 s44, s97, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s97, 16
+; SI-NEXT: s_and_b32 s46, s96, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s96, 16
+; SI-NEXT: s_and_b32 s56, s87, 0xffff0000
+; SI-NEXT: s_lshl_b32 s57, s87, 16
+; SI-NEXT: s_and_b32 s58, s86, 0xffff0000
+; SI-NEXT: s_lshl_b32 s59, s86, 16
+; SI-NEXT: s_and_b32 s60, s85, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s85, 16
+; SI-NEXT: s_and_b32 s62, s84, 0xffff0000
+; SI-NEXT: s_lshl_b32 s63, s84, 16
+; SI-NEXT: s_and_b32 s72, s83, 0xffff0000
+; SI-NEXT: s_lshl_b32 s73, s83, 16
+; SI-NEXT: s_and_b32 s74, s82, 0xffff0000
+; SI-NEXT: s_lshl_b32 s75, s82, 16
+; SI-NEXT: s_and_b32 s76, s81, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s81, 16
+; SI-NEXT: s_and_b32 s88, s80, 0xffff0000
+; SI-NEXT: s_lshl_b32 s89, s80, 16
+; SI-NEXT: s_and_b32 s90, s71, 0xffff0000
+; SI-NEXT: s_lshl_b32 s91, s71, 16
+; SI-NEXT: s_and_b32 s92, s70, 0xffff0000
+; SI-NEXT: s_lshl_b32 s93, s70, 16
+; SI-NEXT: s_and_b32 s94, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s29, 16
+; SI-NEXT: s_and_b32 s30, s28, 0xffff0000
+; SI-NEXT: s_lshl_b32 s31, s28, 16
+; SI-NEXT: s_and_b32 s34, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s27, 16
+; SI-NEXT: s_and_b32 s36, s26, 0xffff0000
+; SI-NEXT: s_lshl_b32 s37, s26, 16
+; SI-NEXT: s_and_b32 s38, s25, 0xffff0000
+; SI-NEXT: s_lshl_b32 s39, s25, 16
+; SI-NEXT: s_and_b32 s48, s24, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s24, 16
+; SI-NEXT: s_and_b32 s50, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s51, s23, 16
+; SI-NEXT: s_and_b32 s52, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s53, s22, 16
+; SI-NEXT: s_and_b32 s54, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s55, s21, 16
+; SI-NEXT: s_and_b32 s64, s20, 0xffff0000
+; SI-NEXT: s_lshl_b32 s65, s20, 16
+; SI-NEXT: s_and_b32 s66, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s67, s19, 16
+; SI-NEXT: s_and_b32 s68, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s69, s18, 16
+; SI-NEXT: s_and_b32 s78, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s17, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v21, s6, 2
; SI-NEXT: s_lshl_b32 s6, s16, 16
; SI-NEXT: v_writelane_b32 v21, s6, 3
; SI-NEXT: .LBB17_5: ; %end
-; SI-NEXT: v_readlane_b32 s6, v21, 2
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
; SI-NEXT: v_readlane_b32 s6, v21, 3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
+; SI-NEXT: v_readlane_b32 s6, v21, 2
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: v_readlane_b32 s99, v20, 35
@@ -24310,213 +24377,224 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB18_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
@@ -24542,132 +24620,173 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16
-; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16
-; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16
-; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16
-; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
@@ -24713,263 +24832,315 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: .LBB18_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB18_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB18_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -25003,12 +25174,12 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB18_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -25020,14 +25191,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -25038,14 +25209,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -25056,14 +25227,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -25074,14 +25245,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -25092,14 +25263,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -25110,14 +25281,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -25128,14 +25299,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -25146,14 +25317,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -25164,14 +25335,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -25182,14 +25353,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -25200,14 +25371,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -25218,14 +25389,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -25236,14 +25407,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -25254,14 +25425,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -25272,14 +25443,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -25290,15 +25461,15 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -25309,14 +25480,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -25327,14 +25498,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -25345,14 +25516,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -25363,14 +25534,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -25381,14 +25552,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -25399,14 +25570,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -25417,14 +25588,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -25435,14 +25606,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -25453,14 +25624,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -25471,14 +25642,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -25489,14 +25660,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -25507,14 +25678,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -25525,14 +25696,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
@@ -25543,14 +25714,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -25561,14 +25732,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -25579,8 +25750,8 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB18_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -27212,6 +27383,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-LABEL: bitcast_v64bf16_to_v32i32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
@@ -27228,533 +27400,621 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: v_mov_b32_e32 v39, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: v_mov_b32_e32 v38, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
-; SI-NEXT: v_mov_b32_e32 v37, v14
-; SI-NEXT: v_mov_b32_e32 v14, v11
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; SI-NEXT: v_mov_b32_e32 v38, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v22, v26
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v43, v23
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16
-; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16
-; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16
-; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v35, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v43, v8
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
-; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v58, v11
-; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v11
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v46, v12
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v63, v14
-; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v36, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v53, v38
-; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: s_cbranch_execnz .LBB19_3
; SI-NEXT: .LBB19_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB19_3: ; %end
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -27772,41 +28032,28 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB19_4:
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v43, v23
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_branch .LBB19_2
;
@@ -27851,12 +28098,12 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB19_3
; VI-NEXT: .LBB19_2: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -27867,14 +28114,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -27885,14 +28132,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -27903,14 +28150,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -27921,14 +28168,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -27939,14 +28186,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -27957,14 +28204,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -27975,14 +28222,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -27993,14 +28240,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -28011,14 +28258,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -28029,14 +28276,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -28047,14 +28294,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -28065,14 +28312,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -28083,14 +28330,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -28101,14 +28348,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -28119,14 +28366,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -28137,14 +28384,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -28155,14 +28402,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -28173,14 +28420,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -28191,14 +28438,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -28209,14 +28456,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -28227,14 +28474,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -28245,14 +28492,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -28263,14 +28510,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -28281,14 +28528,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -28299,14 +28546,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -28317,14 +28564,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -28335,14 +28582,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -28353,14 +28600,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -28371,14 +28618,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
@@ -28389,14 +28636,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -28407,14 +28654,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -28425,8 +28672,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB19_3: ; %end
; VI-NEXT: v_mov_b32_e32 v18, v32
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -59360,36 +59607,36 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -59465,118 +59712,118 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -59619,7 +59866,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
@@ -59628,62 +59875,62 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; SI-NEXT: v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; SI-NEXT: v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
@@ -59703,165 +59950,180 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v14, 1.0, v14
; SI-NEXT: v_add_f32_e32 v15, 1.0, v15
; SI-NEXT: v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: .LBB40_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59869,10 +60131,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59880,10 +60143,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59891,10 +60155,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59902,10 +60167,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59913,10 +60179,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59924,10 +60191,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59935,10 +60203,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59946,10 +60215,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59957,10 +60227,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59968,10 +60239,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59979,10 +60251,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -59990,10 +60263,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -60001,10 +60275,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -60012,10 +60287,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -60023,19 +60299,21 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -60044,10 +60322,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -60225,8 +60504,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_writelane_b32 v63, s30, 0
@@ -60372,109 +60651,107 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: s_lshl_b32 s59, s16, 16
; SI-NEXT: s_cbranch_execnz .LBB41_4
; SI-NEXT: .LBB41_2: ; %cmp.true
-; SI-NEXT: v_add_f32_e64 v2, s19, 1.0
-; SI-NEXT: v_add_f32_e64 v4, s47, 1.0
+; SI-NEXT: v_add_f32_e64 v5, s47, 1.0
; SI-NEXT: v_add_f32_e64 v1, s18, 1.0
-; SI-NEXT: v_add_f32_e64 v6, s46, 1.0
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
-; SI-NEXT: v_add_f32_e64 v2, s17, 1.0
; SI-NEXT: v_add_f32_e64 v3, s20, 1.0
-; SI-NEXT: v_add_f32_e64 v45, s21, 1.0
-; SI-NEXT: v_add_f32_e64 v43, s22, 1.0
-; SI-NEXT: v_add_f32_e64 v41, s23, 1.0
-; SI-NEXT: v_add_f32_e64 v55, s24, 1.0
-; SI-NEXT: v_add_f32_e64 v53, s25, 1.0
-; SI-NEXT: v_add_f32_e64 v51, s26, 1.0
-; SI-NEXT: v_add_f32_e64 v49, s27, 1.0
-; SI-NEXT: v_add_f32_e64 v39, s28, 1.0
-; SI-NEXT: v_add_f32_e64 v37, s29, 1.0
-; SI-NEXT: v_add_f32_e64 v35, s6, 1.0
-; SI-NEXT: v_add_f32_e64 v33, s7, 1.0
-; SI-NEXT: v_add_f32_e64 v31, s8, 1.0
-; SI-NEXT: v_add_f32_e64 v29, s9, 1.0
-; SI-NEXT: v_add_f32_e64 v27, s10, 1.0
-; SI-NEXT: v_add_f32_e64 v25, s11, 1.0
-; SI-NEXT: v_add_f32_e64 v23, s12, 1.0
-; SI-NEXT: v_add_f32_e64 v21, s13, 1.0
-; SI-NEXT: v_add_f32_e64 v19, s14, 1.0
-; SI-NEXT: v_add_f32_e64 v17, s15, 1.0
-; SI-NEXT: v_add_f32_e64 v15, s40, 1.0
-; SI-NEXT: v_add_f32_e64 v13, s41, 1.0
-; SI-NEXT: v_add_f32_e64 v11, s42, 1.0
-; SI-NEXT: v_add_f32_e64 v9, s43, 1.0
-; SI-NEXT: v_add_f32_e64 v7, s44, 1.0
-; SI-NEXT: v_add_f32_e64 v5, s45, 1.0
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e64 v4, s46, 1.0
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_f32_e64 v2, s19, 1.0
+; SI-NEXT: v_add_f32_e64 v46, s21, 1.0
+; SI-NEXT: v_add_f32_e64 v44, s22, 1.0
+; SI-NEXT: v_add_f32_e64 v42, s23, 1.0
+; SI-NEXT: v_add_f32_e64 v40, s24, 1.0
+; SI-NEXT: v_add_f32_e64 v54, s25, 1.0
+; SI-NEXT: v_add_f32_e64 v52, s26, 1.0
+; SI-NEXT: v_add_f32_e64 v50, s27, 1.0
+; SI-NEXT: v_add_f32_e64 v48, s28, 1.0
+; SI-NEXT: v_add_f32_e64 v38, s29, 1.0
+; SI-NEXT: v_add_f32_e64 v36, s6, 1.0
+; SI-NEXT: v_add_f32_e64 v34, s7, 1.0
+; SI-NEXT: v_add_f32_e64 v32, s8, 1.0
+; SI-NEXT: v_add_f32_e64 v30, s9, 1.0
+; SI-NEXT: v_add_f32_e64 v28, s10, 1.0
+; SI-NEXT: v_add_f32_e64 v26, s11, 1.0
+; SI-NEXT: v_add_f32_e64 v24, s12, 1.0
+; SI-NEXT: v_add_f32_e64 v22, s13, 1.0
+; SI-NEXT: v_add_f32_e64 v20, s14, 1.0
+; SI-NEXT: v_add_f32_e64 v18, s15, 1.0
+; SI-NEXT: v_add_f32_e64 v16, s40, 1.0
+; SI-NEXT: v_add_f32_e64 v14, s41, 1.0
+; SI-NEXT: v_add_f32_e64 v12, s42, 1.0
+; SI-NEXT: v_add_f32_e64 v10, s43, 1.0
+; SI-NEXT: v_add_f32_e64 v8, s44, 1.0
+; SI-NEXT: v_add_f32_e64 v6, s45, 1.0
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
+; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; SI-NEXT: v_add_f32_e64 v1, s17, 1.0
+; SI-NEXT: v_add_f32_e64 v3, s16, 1.0
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2
-; SI-NEXT: v_add_f32_e64 v2, s16, 1.0
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
-; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v33
-; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v39
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v49
-; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51
-; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v53
-; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v55
-; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v41
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41
-; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v43
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
-; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v45
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45
-; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54
+; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v42
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v44
+; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v46
+; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46
+; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
+; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_branch .LBB41_5
; SI-NEXT: .LBB41_3:
; SI-NEXT: ; implicit-def: $sgpr4
@@ -60547,10 +60824,12 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: s_branch .LBB41_2
; SI-NEXT: .LBB41_4:
+; SI-NEXT: v_mov_b32_e32 v4, s85
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s4, v62, 0
-; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v5, s4
; SI-NEXT: v_readlane_b32 s4, v62, 1
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_readlane_b32 s4, v62, 2
@@ -60558,281 +60837,314 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_readlane_b32 s4, v62, 3
-; SI-NEXT: v_mov_b32_e32 v2, s59
-; SI-NEXT: v_mov_b32_e32 v3, s58
-; SI-NEXT: v_mov_b32_e32 v61, s57
-; SI-NEXT: v_mov_b32_e32 v1, s56
-; SI-NEXT: v_mov_b32_e32 v59, s99
-; SI-NEXT: v_mov_b32_e32 v60, s98
-; SI-NEXT: v_mov_b32_e32 v57, s97
-; SI-NEXT: v_mov_b32_e32 v58, s96
-; SI-NEXT: v_mov_b32_e32 v47, s87
-; SI-NEXT: v_mov_b32_e32 v56, s86
-; SI-NEXT: v_mov_b32_e32 v45, s85
-; SI-NEXT: v_mov_b32_e32 v46, s84
-; SI-NEXT: v_mov_b32_e32 v43, s83
-; SI-NEXT: v_mov_b32_e32 v44, s82
-; SI-NEXT: v_mov_b32_e32 v41, s81
-; SI-NEXT: v_mov_b32_e32 v42, s80
-; SI-NEXT: v_mov_b32_e32 v55, s71
-; SI-NEXT: v_mov_b32_e32 v40, s70
-; SI-NEXT: v_mov_b32_e32 v53, s69
-; SI-NEXT: v_mov_b32_e32 v54, s68
-; SI-NEXT: v_mov_b32_e32 v51, s67
-; SI-NEXT: v_mov_b32_e32 v52, s66
-; SI-NEXT: v_mov_b32_e32 v49, s65
-; SI-NEXT: v_mov_b32_e32 v50, s64
-; SI-NEXT: v_mov_b32_e32 v39, s55
-; SI-NEXT: v_mov_b32_e32 v48, s54
-; SI-NEXT: v_mov_b32_e32 v37, s53
-; SI-NEXT: v_mov_b32_e32 v38, s52
-; SI-NEXT: v_mov_b32_e32 v35, s51
-; SI-NEXT: v_mov_b32_e32 v36, s50
-; SI-NEXT: v_mov_b32_e32 v33, s49
-; SI-NEXT: v_mov_b32_e32 v34, s48
-; SI-NEXT: v_mov_b32_e32 v31, s39
-; SI-NEXT: v_mov_b32_e32 v32, s38
-; SI-NEXT: v_mov_b32_e32 v29, s37
-; SI-NEXT: v_mov_b32_e32 v30, s36
-; SI-NEXT: v_mov_b32_e32 v27, s35
-; SI-NEXT: v_mov_b32_e32 v28, s34
-; SI-NEXT: v_mov_b32_e32 v25, s31
-; SI-NEXT: v_mov_b32_e32 v26, s30
-; SI-NEXT: v_mov_b32_e32 v23, s95
-; SI-NEXT: v_mov_b32_e32 v24, s94
-; SI-NEXT: v_mov_b32_e32 v21, s93
-; SI-NEXT: v_mov_b32_e32 v22, s92
-; SI-NEXT: v_mov_b32_e32 v19, s91
-; SI-NEXT: v_mov_b32_e32 v20, s90
-; SI-NEXT: v_mov_b32_e32 v17, s89
-; SI-NEXT: v_mov_b32_e32 v18, s88
-; SI-NEXT: v_mov_b32_e32 v15, s79
-; SI-NEXT: v_mov_b32_e32 v16, s78
-; SI-NEXT: v_mov_b32_e32 v13, s77
-; SI-NEXT: v_mov_b32_e32 v14, s76
-; SI-NEXT: v_mov_b32_e32 v11, s75
-; SI-NEXT: v_mov_b32_e32 v12, s74
-; SI-NEXT: v_mov_b32_e32 v9, s73
-; SI-NEXT: v_mov_b32_e32 v10, s72
-; SI-NEXT: v_mov_b32_e32 v7, s63
-; SI-NEXT: v_mov_b32_e32 v8, s62
-; SI-NEXT: v_mov_b32_e32 v5, s61
-; SI-NEXT: v_mov_b32_e32 v6, s60
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v3, s59
+; SI-NEXT: v_mov_b32_e32 v2, s58
+; SI-NEXT: v_mov_b32_e32 v1, s57
+; SI-NEXT: v_mov_b32_e32 v61, s56
+; SI-NEXT: v_mov_b32_e32 v60, s99
+; SI-NEXT: v_mov_b32_e32 v59, s98
+; SI-NEXT: v_mov_b32_e32 v58, s97
+; SI-NEXT: v_mov_b32_e32 v57, s96
+; SI-NEXT: v_mov_b32_e32 v56, s87
+; SI-NEXT: v_mov_b32_e32 v47, s86
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: v_mov_b32_e32 v5, s60
+; SI-NEXT: v_mov_b32_e32 v6, s61
+; SI-NEXT: v_mov_b32_e32 v7, s62
+; SI-NEXT: v_mov_b32_e32 v8, s63
+; SI-NEXT: v_mov_b32_e32 v9, s72
+; SI-NEXT: v_mov_b32_e32 v10, s73
+; SI-NEXT: v_mov_b32_e32 v11, s74
+; SI-NEXT: v_mov_b32_e32 v12, s75
+; SI-NEXT: v_mov_b32_e32 v13, s76
+; SI-NEXT: v_mov_b32_e32 v14, s77
+; SI-NEXT: v_mov_b32_e32 v15, s78
+; SI-NEXT: v_mov_b32_e32 v16, s79
+; SI-NEXT: v_mov_b32_e32 v17, s88
+; SI-NEXT: v_mov_b32_e32 v18, s89
+; SI-NEXT: v_mov_b32_e32 v19, s90
+; SI-NEXT: v_mov_b32_e32 v20, s91
+; SI-NEXT: v_mov_b32_e32 v21, s92
+; SI-NEXT: v_mov_b32_e32 v22, s93
+; SI-NEXT: v_mov_b32_e32 v23, s94
+; SI-NEXT: v_mov_b32_e32 v24, s95
+; SI-NEXT: v_mov_b32_e32 v25, s30
+; SI-NEXT: v_mov_b32_e32 v26, s31
+; SI-NEXT: v_mov_b32_e32 v27, s34
+; SI-NEXT: v_mov_b32_e32 v28, s35
+; SI-NEXT: v_mov_b32_e32 v29, s36
+; SI-NEXT: v_mov_b32_e32 v30, s37
+; SI-NEXT: v_mov_b32_e32 v31, s38
+; SI-NEXT: v_mov_b32_e32 v32, s39
+; SI-NEXT: v_mov_b32_e32 v33, s48
+; SI-NEXT: v_mov_b32_e32 v34, s49
+; SI-NEXT: v_mov_b32_e32 v35, s50
+; SI-NEXT: v_mov_b32_e32 v36, s51
+; SI-NEXT: v_mov_b32_e32 v37, s52
+; SI-NEXT: v_mov_b32_e32 v38, s53
+; SI-NEXT: v_mov_b32_e32 v39, s54
+; SI-NEXT: v_mov_b32_e32 v48, s55
+; SI-NEXT: v_mov_b32_e32 v49, s64
+; SI-NEXT: v_mov_b32_e32 v50, s65
+; SI-NEXT: v_mov_b32_e32 v51, s66
+; SI-NEXT: v_mov_b32_e32 v52, s67
+; SI-NEXT: v_mov_b32_e32 v53, s68
+; SI-NEXT: v_mov_b32_e32 v54, s69
+; SI-NEXT: v_mov_b32_e32 v55, s70
+; SI-NEXT: v_mov_b32_e32 v40, s71
+; SI-NEXT: v_mov_b32_e32 v41, s80
+; SI-NEXT: v_mov_b32_e32 v42, s81
+; SI-NEXT: v_mov_b32_e32 v43, s82
+; SI-NEXT: v_mov_b32_e32 v44, s83
+; SI-NEXT: v_mov_b32_e32 v45, s84
; SI-NEXT: .LBB41_5: ; %end
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_readlane_b32 s99, v63, 35
; SI-NEXT: v_readlane_b32 s98, v63, 34
; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -60869,22 +61181,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: v_readlane_b32 s34, v63, 2
; SI-NEXT: v_readlane_b32 s31, v63, 1
; SI-NEXT: v_readlane_b32 s30, v63, 0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -60901,8 +61214,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -61153,213 +61466,224 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB42_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
@@ -61385,132 +61709,173 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16
-; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16
-; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16
-; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16
-; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
@@ -61556,263 +61921,315 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: .LBB42_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB42_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB42_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -61846,12 +62263,12 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB42_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -61863,14 +62280,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -61881,14 +62298,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -61899,14 +62316,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -61917,14 +62334,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -61935,14 +62352,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -61953,14 +62370,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -61971,14 +62388,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -61989,14 +62406,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -62007,14 +62424,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -62025,14 +62442,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -62043,14 +62460,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -62061,14 +62478,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -62079,14 +62496,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -62097,14 +62514,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -62115,14 +62532,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -62133,15 +62550,15 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -62152,14 +62569,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -62170,14 +62587,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -62188,14 +62605,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -62206,14 +62623,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -62224,14 +62641,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -62242,14 +62659,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -62260,14 +62677,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -62278,14 +62695,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -62296,14 +62713,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -62314,14 +62731,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -62332,14 +62749,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -62350,14 +62767,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -62368,14 +62785,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
@@ -62386,14 +62803,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -62404,14 +62821,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -62422,8 +62839,8 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB42_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -64055,6 +64472,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-LABEL: bitcast_v64bf16_to_v32f32_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
@@ -64071,533 +64489,621 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: v_mov_b32_e32 v39, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: v_mov_b32_e32 v38, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
-; SI-NEXT: v_mov_b32_e32 v37, v14
-; SI-NEXT: v_mov_b32_e32 v14, v11
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB43_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; SI-NEXT: v_mov_b32_e32 v38, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v22, v26
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v43, v23
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16
-; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16
-; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16
-; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v35, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v43, v8
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
-; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v58, v11
-; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v11
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v46, v12
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v63, v14
-; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v36, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v53, v38
-; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: s_cbranch_execnz .LBB43_3
; SI-NEXT: .LBB43_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB43_3: ; %end
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -64615,41 +65121,28 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB43_4:
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v43, v23
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_branch .LBB43_2
;
@@ -64694,12 +65187,12 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB43_3
; VI-NEXT: .LBB43_2: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -64710,14 +65203,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -64728,14 +65221,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -64746,14 +65239,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -64764,14 +65257,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -64782,14 +65275,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -64800,14 +65293,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -64818,14 +65311,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -64836,14 +65329,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -64854,14 +65347,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -64872,14 +65365,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -64890,14 +65383,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -64908,14 +65401,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -64926,14 +65419,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -64944,14 +65437,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -64962,14 +65455,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -64980,14 +65473,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -64998,14 +65491,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -65016,14 +65509,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -65034,14 +65527,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -65052,14 +65545,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -65070,14 +65563,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -65088,14 +65581,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -65106,14 +65599,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -65124,14 +65617,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -65142,14 +65635,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -65160,14 +65653,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -65178,14 +65671,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -65196,14 +65689,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -65214,14 +65707,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
@@ -65232,14 +65725,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -65250,14 +65743,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -65268,8 +65761,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB43_3: ; %end
; VI-NEXT: v_mov_b32_e32 v18, v32
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -94210,36 +94703,36 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: ; implicit-def: $vgpr31
@@ -94315,118 +94808,118 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
@@ -94499,220 +94992,235 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc
; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: .LBB60_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94720,10 +95228,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94731,10 +95240,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94742,10 +95252,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94753,10 +95264,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94764,10 +95276,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94775,10 +95288,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94786,10 +95300,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94797,10 +95312,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94808,10 +95324,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94819,10 +95336,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94830,10 +95348,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94841,10 +95360,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94852,10 +95372,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94863,10 +95384,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94874,19 +95396,21 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -94895,10 +95419,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -95172,66 +95697,66 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: v_writelane_b32 v21, s4, 2
; SI-NEXT: s_lshl_b32 s4, s8, 16
; SI-NEXT: v_writelane_b32 v21, s4, 3
-; SI-NEXT: s_and_b32 s11, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s10, s7, 16
-; SI-NEXT: s_and_b32 s13, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s12, s6, 16
-; SI-NEXT: s_and_b32 s15, s99, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s99, 16
-; SI-NEXT: s_and_b32 s41, s98, 0xffff0000
-; SI-NEXT: s_lshl_b32 s40, s98, 16
-; SI-NEXT: s_and_b32 s43, s97, 0xffff0000
-; SI-NEXT: s_lshl_b32 s42, s97, 16
-; SI-NEXT: s_and_b32 s45, s96, 0xffff0000
-; SI-NEXT: s_lshl_b32 s44, s96, 16
-; SI-NEXT: s_and_b32 s47, s87, 0xffff0000
-; SI-NEXT: s_lshl_b32 s46, s87, 16
-; SI-NEXT: s_and_b32 s57, s86, 0xffff0000
-; SI-NEXT: s_lshl_b32 s56, s86, 16
-; SI-NEXT: s_and_b32 s59, s85, 0xffff0000
-; SI-NEXT: s_lshl_b32 s58, s85, 16
-; SI-NEXT: s_and_b32 s61, s84, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s84, 16
-; SI-NEXT: s_and_b32 s63, s83, 0xffff0000
-; SI-NEXT: s_lshl_b32 s62, s83, 16
-; SI-NEXT: s_and_b32 s73, s82, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s82, 16
-; SI-NEXT: s_and_b32 s75, s81, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s81, 16
-; SI-NEXT: s_and_b32 s77, s80, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s80, 16
-; SI-NEXT: s_and_b32 s79, s71, 0xffff0000
-; SI-NEXT: s_lshl_b32 s78, s71, 16
-; SI-NEXT: s_and_b32 s89, s70, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s70, 16
-; SI-NEXT: s_and_b32 s91, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s29, 16
-; SI-NEXT: s_and_b32 s93, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s92, s28, 16
-; SI-NEXT: s_and_b32 s95, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s94, s27, 16
-; SI-NEXT: s_and_b32 s31, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s30, s26, 16
-; SI-NEXT: s_and_b32 s35, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s34, s25, 16
-; SI-NEXT: s_and_b32 s37, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s36, s24, 16
-; SI-NEXT: s_and_b32 s39, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s38, s23, 16
-; SI-NEXT: s_and_b32 s49, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s48, s22, 16
-; SI-NEXT: s_and_b32 s51, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s50, s21, 16
-; SI-NEXT: s_and_b32 s53, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s52, s20, 16
-; SI-NEXT: s_and_b32 s55, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s54, s19, 16
-; SI-NEXT: s_and_b32 s65, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s64, s18, 16
-; SI-NEXT: s_and_b32 s67, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s66, s17, 16
-; SI-NEXT: s_and_b32 s69, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s68, s16, 16
+; SI-NEXT: s_and_b32 s10, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s7, 16
+; SI-NEXT: s_and_b32 s12, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s6, 16
+; SI-NEXT: s_and_b32 s14, s99, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s99, 16
+; SI-NEXT: s_and_b32 s40, s98, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s98, 16
+; SI-NEXT: s_and_b32 s42, s97, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s97, 16
+; SI-NEXT: s_and_b32 s44, s96, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s96, 16
+; SI-NEXT: s_and_b32 s46, s87, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s87, 16
+; SI-NEXT: s_and_b32 s56, s86, 0xffff0000
+; SI-NEXT: s_lshl_b32 s57, s86, 16
+; SI-NEXT: s_and_b32 s58, s85, 0xffff0000
+; SI-NEXT: s_lshl_b32 s59, s85, 16
+; SI-NEXT: s_and_b32 s60, s84, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s84, 16
+; SI-NEXT: s_and_b32 s62, s83, 0xffff0000
+; SI-NEXT: s_lshl_b32 s63, s83, 16
+; SI-NEXT: s_and_b32 s72, s82, 0xffff0000
+; SI-NEXT: s_lshl_b32 s73, s82, 16
+; SI-NEXT: s_and_b32 s74, s81, 0xffff0000
+; SI-NEXT: s_lshl_b32 s75, s81, 16
+; SI-NEXT: s_and_b32 s76, s80, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s80, 16
+; SI-NEXT: s_and_b32 s78, s71, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s71, 16
+; SI-NEXT: s_and_b32 s88, s70, 0xffff0000
+; SI-NEXT: s_lshl_b32 s89, s70, 16
+; SI-NEXT: s_and_b32 s90, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s91, s29, 16
+; SI-NEXT: s_and_b32 s92, s28, 0xffff0000
+; SI-NEXT: s_lshl_b32 s93, s28, 16
+; SI-NEXT: s_and_b32 s94, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s27, 16
+; SI-NEXT: s_and_b32 s30, s26, 0xffff0000
+; SI-NEXT: s_lshl_b32 s31, s26, 16
+; SI-NEXT: s_and_b32 s34, s25, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s25, 16
+; SI-NEXT: s_and_b32 s36, s24, 0xffff0000
+; SI-NEXT: s_lshl_b32 s37, s24, 16
+; SI-NEXT: s_and_b32 s38, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s39, s23, 16
+; SI-NEXT: s_and_b32 s48, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s22, 16
+; SI-NEXT: s_and_b32 s50, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s51, s21, 16
+; SI-NEXT: s_and_b32 s52, s20, 0xffff0000
+; SI-NEXT: s_lshl_b32 s53, s20, 16
+; SI-NEXT: s_and_b32 s54, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s55, s19, 16
+; SI-NEXT: s_and_b32 s64, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s65, s18, 16
+; SI-NEXT: s_and_b32 s66, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s67, s17, 16
+; SI-NEXT: s_and_b32 s68, s16, 0xffff0000
+; SI-NEXT: s_lshl_b32 s69, s16, 16
; SI-NEXT: s_cbranch_execnz .LBB61_3
; SI-NEXT: .LBB61_2: ; %cmp.true
; SI-NEXT: s_add_u32 s4, s16, 3
@@ -95250,18 +95775,18 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: s_addc_u32 s27, s29, 0
; SI-NEXT: s_add_u32 s28, s70, 3
; SI-NEXT: s_addc_u32 s29, s71, 0
-; SI-NEXT: s_add_u32 s76, s80, 3
-; SI-NEXT: s_addc_u32 s74, s81, 0
-; SI-NEXT: s_add_u32 s72, s82, 3
-; SI-NEXT: s_addc_u32 s62, s83, 0
-; SI-NEXT: s_add_u32 s60, s84, 3
-; SI-NEXT: s_addc_u32 s58, s85, 0
-; SI-NEXT: s_add_u32 s56, s86, 3
-; SI-NEXT: s_addc_u32 s46, s87, 0
-; SI-NEXT: s_add_u32 s44, s96, 3
-; SI-NEXT: s_addc_u32 s42, s97, 0
-; SI-NEXT: s_add_u32 s40, s98, 3
-; SI-NEXT: s_addc_u32 s14, s99, 0
+; SI-NEXT: s_add_u32 s77, s80, 3
+; SI-NEXT: s_addc_u32 s75, s81, 0
+; SI-NEXT: s_add_u32 s73, s82, 3
+; SI-NEXT: s_addc_u32 s63, s83, 0
+; SI-NEXT: s_add_u32 s61, s84, 3
+; SI-NEXT: s_addc_u32 s59, s85, 0
+; SI-NEXT: s_add_u32 s57, s86, 3
+; SI-NEXT: s_addc_u32 s47, s87, 0
+; SI-NEXT: s_add_u32 s45, s96, 3
+; SI-NEXT: s_addc_u32 s43, s97, 0
+; SI-NEXT: s_add_u32 s41, s98, 3
+; SI-NEXT: s_addc_u32 s15, s99, 0
; SI-NEXT: s_add_u32 s6, s6, 3
; SI-NEXT: s_addc_u32 s7, s7, 0
; SI-NEXT: s_add_u32 s8, s8, 3
@@ -95274,292 +95799,324 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: s_and_b32 s9, s8, 0xffff0000
; SI-NEXT: v_writelane_b32 v21, s9, 2
; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: s_and_b32 s11, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s10, s7, 16
-; SI-NEXT: s_and_b32 s13, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s12, s6, 16
-; SI-NEXT: s_and_b32 s15, s14, 0xffff0000
-; SI-NEXT: s_lshl_b32 s14, s14, 16
-; SI-NEXT: s_and_b32 s41, s40, 0xffff0000
-; SI-NEXT: s_lshl_b32 s40, s40, 16
-; SI-NEXT: s_and_b32 s43, s42, 0xffff0000
-; SI-NEXT: s_lshl_b32 s42, s42, 16
-; SI-NEXT: s_and_b32 s45, s44, 0xffff0000
-; SI-NEXT: s_lshl_b32 s44, s44, 16
-; SI-NEXT: s_and_b32 s47, s46, 0xffff0000
-; SI-NEXT: s_lshl_b32 s46, s46, 16
-; SI-NEXT: s_and_b32 s57, s56, 0xffff0000
-; SI-NEXT: s_lshl_b32 s56, s56, 16
-; SI-NEXT: s_and_b32 s59, s58, 0xffff0000
-; SI-NEXT: s_lshl_b32 s58, s58, 16
-; SI-NEXT: s_and_b32 s61, s60, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s60, 16
-; SI-NEXT: s_and_b32 s63, s62, 0xffff0000
-; SI-NEXT: s_lshl_b32 s62, s62, 16
-; SI-NEXT: s_and_b32 s73, s72, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s72, 16
-; SI-NEXT: s_and_b32 s75, s74, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s74, 16
-; SI-NEXT: s_and_b32 s77, s76, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s76, 16
-; SI-NEXT: s_and_b32 s79, s29, 0xffff0000
-; SI-NEXT: s_lshl_b32 s78, s29, 16
-; SI-NEXT: s_and_b32 s89, s28, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s28, 16
-; SI-NEXT: s_and_b32 s91, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s27, 16
-; SI-NEXT: s_and_b32 s93, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s92, s26, 16
-; SI-NEXT: s_and_b32 s95, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s94, s25, 16
-; SI-NEXT: s_and_b32 s31, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s30, s24, 16
-; SI-NEXT: s_and_b32 s35, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s34, s23, 16
-; SI-NEXT: s_and_b32 s37, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s36, s22, 16
-; SI-NEXT: s_and_b32 s39, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s38, s21, 16
-; SI-NEXT: s_and_b32 s49, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s48, s20, 16
-; SI-NEXT: s_and_b32 s51, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s50, s19, 16
-; SI-NEXT: s_and_b32 s53, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s52, s18, 16
-; SI-NEXT: s_and_b32 s55, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s54, s17, 16
-; SI-NEXT: s_and_b32 s65, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s64, s16, 16
-; SI-NEXT: s_and_b32 s67, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s66, s5, 16
-; SI-NEXT: s_and_b32 s69, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s68, s4, 16
+; SI-NEXT: s_and_b32 s10, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s11, s7, 16
+; SI-NEXT: s_and_b32 s12, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s13, s6, 16
+; SI-NEXT: s_and_b32 s14, s15, 0xffff0000
+; SI-NEXT: s_lshl_b32 s15, s15, 16
+; SI-NEXT: s_and_b32 s40, s41, 0xffff0000
+; SI-NEXT: s_lshl_b32 s41, s41, 16
+; SI-NEXT: s_and_b32 s42, s43, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s43, 16
+; SI-NEXT: s_and_b32 s44, s45, 0xffff0000
+; SI-NEXT: s_lshl_b32 s45, s45, 16
+; SI-NEXT: s_and_b32 s46, s47, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s47, 16
+; SI-NEXT: s_and_b32 s56, s57, 0xffff0000
+; SI-NEXT: s_lshl_b32 s57, s57, 16
+; SI-NEXT: s_and_b32 s58, s59, 0xffff0000
+; SI-NEXT: s_lshl_b32 s59, s59, 16
+; SI-NEXT: s_and_b32 s60, s61, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s61, 16
+; SI-NEXT: s_and_b32 s62, s63, 0xffff0000
+; SI-NEXT: s_lshl_b32 s63, s63, 16
+; SI-NEXT: s_and_b32 s72, s73, 0xffff0000
+; SI-NEXT: s_lshl_b32 s73, s73, 16
+; SI-NEXT: s_and_b32 s74, s75, 0xffff0000
+; SI-NEXT: s_lshl_b32 s75, s75, 16
+; SI-NEXT: s_and_b32 s76, s77, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s77, 16
+; SI-NEXT: s_and_b32 s78, s29, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s29, 16
+; SI-NEXT: s_and_b32 s88, s28, 0xffff0000
+; SI-NEXT: s_lshl_b32 s89, s28, 16
+; SI-NEXT: s_and_b32 s90, s27, 0xffff0000
+; SI-NEXT: s_lshl_b32 s91, s27, 16
+; SI-NEXT: s_and_b32 s92, s26, 0xffff0000
+; SI-NEXT: s_lshl_b32 s93, s26, 16
+; SI-NEXT: s_and_b32 s94, s25, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s25, 16
+; SI-NEXT: s_and_b32 s30, s24, 0xffff0000
+; SI-NEXT: s_lshl_b32 s31, s24, 16
+; SI-NEXT: s_and_b32 s34, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s23, 16
+; SI-NEXT: s_and_b32 s36, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s37, s22, 16
+; SI-NEXT: s_and_b32 s38, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s39, s21, 16
+; SI-NEXT: s_and_b32 s48, s20, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s20, 16
+; SI-NEXT: s_and_b32 s50, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s51, s19, 16
+; SI-NEXT: s_and_b32 s52, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s53, s18, 16
+; SI-NEXT: s_and_b32 s54, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s55, s17, 16
+; SI-NEXT: s_and_b32 s64, s16, 0xffff0000
+; SI-NEXT: s_lshl_b32 s65, s16, 16
+; SI-NEXT: s_and_b32 s66, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s67, s5, 16
+; SI-NEXT: s_and_b32 s68, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s69, s4, 16
; SI-NEXT: v_writelane_b32 v21, s8, 3
; SI-NEXT: .LBB61_3: ; %end
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: v_readlane_b32 s4, v21, 2
+; SI-NEXT: v_readlane_b32 s4, v21, 3
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT: v_readlane_b32 s4, v21, 3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_readlane_b32 s4, v21, 2
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: v_readlane_b32 s4, v21, 0
+; SI-NEXT: v_readlane_b32 s4, v21, 1
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT: v_readlane_b32 s4, v21, 1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_readlane_b32 s4, v21, 0
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: v_readlane_b32 s99, v20, 35
@@ -95607,66 +96164,66 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a
; SI-NEXT: .LBB61_4:
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr68
; SI-NEXT: ; implicit-def: $sgpr69
-; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr68
; SI-NEXT: ; implicit-def: $sgpr67
-; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: ; implicit-def: $sgpr55
-; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr52
; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr38
; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr36
; SI-NEXT: ; implicit-def: $sgpr35
-; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr34
; SI-NEXT: ; implicit-def: $sgpr31
-; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr30
; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr92
; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr90
; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr76
; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr72
; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr62
; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr47
-; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $sgpr45
-; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr43
-; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr41
-; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $sgpr15
-; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr13
-; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr11
+; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
@@ -95945,213 +96502,224 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB62_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
@@ -96177,132 +96745,171 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16
-; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16
-; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16
-; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16
-; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
@@ -96348,263 +96955,317 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: .LBB62_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB62_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB62_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -96638,12 +97299,12 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB62_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -96655,14 +97316,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -96673,14 +97334,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -96691,14 +97352,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -96709,14 +97370,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -96727,14 +97388,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -96745,14 +97406,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -96763,14 +97424,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -96781,14 +97442,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -96799,14 +97460,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -96817,14 +97478,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -96835,14 +97496,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -96853,14 +97514,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -96871,14 +97532,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -96889,14 +97550,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -96907,14 +97568,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -96925,15 +97586,15 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -96944,14 +97605,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -96962,14 +97623,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -96980,14 +97641,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -96998,14 +97659,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -97016,14 +97677,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -97034,14 +97695,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -97052,14 +97713,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -97070,14 +97731,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -97088,14 +97749,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -97106,14 +97767,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -97124,14 +97785,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -97142,14 +97803,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -97160,14 +97821,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
@@ -97178,14 +97839,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -97196,14 +97857,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -97214,8 +97875,8 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB62_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -98847,6 +99508,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-LABEL: bitcast_v64bf16_to_v16i64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
@@ -98863,533 +99525,621 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: v_mov_b32_e32 v39, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: v_mov_b32_e32 v38, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
-; SI-NEXT: v_mov_b32_e32 v37, v14
-; SI-NEXT: v_mov_b32_e32 v14, v11
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB63_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; SI-NEXT: v_mov_b32_e32 v38, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v22, v26
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v43, v23
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16
-; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16
-; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16
-; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v35, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v43, v8
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
-; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v58, v11
-; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v11
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v46, v12
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v63, v14
-; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v36, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v53, v38
-; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: s_cbranch_execnz .LBB63_3
; SI-NEXT: .LBB63_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB63_3: ; %end
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -99407,41 +100157,28 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB63_4:
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v43, v23
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_branch .LBB63_2
;
@@ -99486,12 +100223,12 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB63_3
; VI-NEXT: .LBB63_2: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -99502,14 +100239,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -99520,14 +100257,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -99538,14 +100275,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -99556,14 +100293,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -99574,14 +100311,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -99592,14 +100329,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -99610,14 +100347,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -99628,14 +100365,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -99646,14 +100383,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -99664,14 +100401,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -99682,14 +100419,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -99700,14 +100437,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -99718,14 +100455,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -99736,14 +100473,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -99754,14 +100491,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -99772,14 +100509,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -99790,14 +100527,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -99808,14 +100545,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -99826,14 +100563,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -99844,14 +100581,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -99862,14 +100599,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -99880,14 +100617,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -99898,14 +100635,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -99916,14 +100653,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -99934,14 +100671,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -99952,14 +100689,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -99970,14 +100707,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -99988,14 +100725,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -100006,14 +100743,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
@@ -100024,14 +100761,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -100042,14 +100779,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -100060,8 +100797,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB63_3: ; %end
; VI-NEXT: v_mov_b32_e32 v18, v32
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -129053,34 +129790,34 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; kill: killed $vgpr35
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -129090,119 +129827,119 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3
-; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v14
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v12
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v10
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v8
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v7
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3
+; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
@@ -129226,63 +129963,61 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
@@ -129292,170 +130027,186 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14
-; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13
-; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10
-; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10
-; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9
-; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8
-; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7
-; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6
-; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5
-; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4
-; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3
-; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v15
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v14
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v12
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v11
+; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v10
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v10
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9
+; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v8
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v7
+; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v7
+; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v6
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4
+; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3
+; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: .LBB76_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129463,10 +130214,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129474,10 +130226,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129485,10 +130238,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129496,10 +130250,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129507,10 +130262,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129518,10 +130274,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129529,10 +130286,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129540,10 +130298,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129551,10 +130310,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129562,10 +130322,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129573,10 +130334,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129584,10 +130346,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129595,10 +130358,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129606,10 +130370,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129617,10 +130382,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -129628,10 +130394,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
@@ -129641,7 +130408,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -129934,94 +130702,93 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_writelane_b32 v62, s46, 3
; SI-NEXT: s_cbranch_execnz .LBB77_4
; SI-NEXT: .LBB77_2: ; %cmp.true
-; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[24:25], s[14:15], 1.0
; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0
; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
-; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25
+; SI-NEXT: v_add_f64 v[40:41], s[24:25], 1.0
+; SI-NEXT: v_add_f64 v[52:53], s[26:27], 1.0
+; SI-NEXT: v_add_f64 v[48:49], s[28:29], 1.0
+; SI-NEXT: v_add_f64 v[36:37], s[44:45], 1.0
+; SI-NEXT: v_add_f64 v[32:33], s[42:43], 1.0
+; SI-NEXT: v_add_f64 v[28:29], s[40:41], 1.0
+; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0
+; SI-NEXT: v_add_f64 v[16:17], s[10:11], 1.0
+; SI-NEXT: v_add_f64 v[12:13], s[8:9], 1.0
+; SI-NEXT: v_add_f64 v[8:9], s[4:5], 1.0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42
-; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42
-; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41
-; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41
-; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v12
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v16
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v16
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v20
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v20
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v29
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v37
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37
+; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v49
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v53
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v53
+; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v41
+; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v41
+; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3
+; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3
+; SI-NEXT: v_add_f64 v[60:61], s[18:19], 1.0
; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0
; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1
; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1
+; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v61
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v61
; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0
-; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0
-; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0
-; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0
-; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0
-; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0
-; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0
-; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0
-; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0
-; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
-; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50
-; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49
-; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52
-; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52
-; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51
-; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v17
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v17
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v21
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v32
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v36
+; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40
; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2
-; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60
-; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60
-; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59
-; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59
+; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v60
+; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_branch .LBB77_5
@@ -130096,236 +130863,243 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: ; kill: killed $sgpr46
; SI-NEXT: s_branch .LBB77_2
; SI-NEXT: .LBB77_4:
-; SI-NEXT: v_mov_b32_e32 v1, s71
+; SI-NEXT: v_mov_b32_e32 v1, s67
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s69
+; SI-NEXT: v_mov_b32_e32 v1, s66
; SI-NEXT: v_readlane_b32 s4, v62, 0
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s68
+; SI-NEXT: v_mov_b32_e32 v1, s65
; SI-NEXT: v_mov_b32_e32 v61, s4
; SI-NEXT: v_readlane_b32 s4, v62, 1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: v_readlane_b32 s4, v62, 2
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_readlane_b32 s4, v62, 3
-; SI-NEXT: v_mov_b32_e32 v5, s59
-; SI-NEXT: v_mov_b32_e32 v4, s58
-; SI-NEXT: v_mov_b32_e32 v9, s57
-; SI-NEXT: v_mov_b32_e32 v6, s56
-; SI-NEXT: v_mov_b32_e32 v13, s99
-; SI-NEXT: v_mov_b32_e32 v10, s98
-; SI-NEXT: v_mov_b32_e32 v17, s97
-; SI-NEXT: v_mov_b32_e32 v14, s96
-; SI-NEXT: v_mov_b32_e32 v21, s87
-; SI-NEXT: v_mov_b32_e32 v18, s86
-; SI-NEXT: v_mov_b32_e32 v25, s85
-; SI-NEXT: v_mov_b32_e32 v22, s84
-; SI-NEXT: v_mov_b32_e32 v29, s83
-; SI-NEXT: v_mov_b32_e32 v26, s82
-; SI-NEXT: v_mov_b32_e32 v33, s81
-; SI-NEXT: v_mov_b32_e32 v30, s80
-; SI-NEXT: v_mov_b32_e32 v34, s70
-; SI-NEXT: v_mov_b32_e32 v8, s67
-; SI-NEXT: v_mov_b32_e32 v7, s66
-; SI-NEXT: v_mov_b32_e32 v24, s65
-; SI-NEXT: v_mov_b32_e32 v23, s64
-; SI-NEXT: v_mov_b32_e32 v16, s55
-; SI-NEXT: v_mov_b32_e32 v15, s54
-; SI-NEXT: v_mov_b32_e32 v28, s53
-; SI-NEXT: v_mov_b32_e32 v27, s52
-; SI-NEXT: v_mov_b32_e32 v12, s51
-; SI-NEXT: v_mov_b32_e32 v11, s50
-; SI-NEXT: v_mov_b32_e32 v32, s49
-; SI-NEXT: v_mov_b32_e32 v31, s48
-; SI-NEXT: v_mov_b32_e32 v20, s39
-; SI-NEXT: v_mov_b32_e32 v19, s38
-; SI-NEXT: v_mov_b32_e32 v36, s37
-; SI-NEXT: v_mov_b32_e32 v35, s36
-; SI-NEXT: v_mov_b32_e32 v38, s35
-; SI-NEXT: v_mov_b32_e32 v37, s34
-; SI-NEXT: v_mov_b32_e32 v48, s31
-; SI-NEXT: v_mov_b32_e32 v39, s30
-; SI-NEXT: v_mov_b32_e32 v50, s95
-; SI-NEXT: v_mov_b32_e32 v49, s94
-; SI-NEXT: v_mov_b32_e32 v52, s93
-; SI-NEXT: v_mov_b32_e32 v51, s92
-; SI-NEXT: v_mov_b32_e32 v54, s91
-; SI-NEXT: v_mov_b32_e32 v53, s90
-; SI-NEXT: v_mov_b32_e32 v40, s89
-; SI-NEXT: v_mov_b32_e32 v55, s88
-; SI-NEXT: v_mov_b32_e32 v42, s79
-; SI-NEXT: v_mov_b32_e32 v41, s78
+; SI-NEXT: v_mov_b32_e32 v4, s59
+; SI-NEXT: v_mov_b32_e32 v5, s58
+; SI-NEXT: v_mov_b32_e32 v6, s57
+; SI-NEXT: v_mov_b32_e32 v7, s56
+; SI-NEXT: v_mov_b32_e32 v10, s99
+; SI-NEXT: v_mov_b32_e32 v11, s98
+; SI-NEXT: v_mov_b32_e32 v14, s97
+; SI-NEXT: v_mov_b32_e32 v15, s96
+; SI-NEXT: v_mov_b32_e32 v18, s87
+; SI-NEXT: v_mov_b32_e32 v19, s86
+; SI-NEXT: v_mov_b32_e32 v22, s85
+; SI-NEXT: v_mov_b32_e32 v23, s84
+; SI-NEXT: v_mov_b32_e32 v26, s83
+; SI-NEXT: v_mov_b32_e32 v27, s82
+; SI-NEXT: v_mov_b32_e32 v30, s81
+; SI-NEXT: v_mov_b32_e32 v31, s80
+; SI-NEXT: v_mov_b32_e32 v34, s71
+; SI-NEXT: v_mov_b32_e32 v35, s70
+; SI-NEXT: v_mov_b32_e32 v38, s69
+; SI-NEXT: v_mov_b32_e32 v39, s68
+; SI-NEXT: v_mov_b32_e32 v24, s64
+; SI-NEXT: v_mov_b32_e32 v25, s55
+; SI-NEXT: v_mov_b32_e32 v16, s54
+; SI-NEXT: v_mov_b32_e32 v17, s53
+; SI-NEXT: v_mov_b32_e32 v28, s52
+; SI-NEXT: v_mov_b32_e32 v29, s51
+; SI-NEXT: v_mov_b32_e32 v12, s50
+; SI-NEXT: v_mov_b32_e32 v13, s49
+; SI-NEXT: v_mov_b32_e32 v32, s48
+; SI-NEXT: v_mov_b32_e32 v33, s39
+; SI-NEXT: v_mov_b32_e32 v20, s38
+; SI-NEXT: v_mov_b32_e32 v21, s37
+; SI-NEXT: v_mov_b32_e32 v36, s36
+; SI-NEXT: v_mov_b32_e32 v37, s35
+; SI-NEXT: v_mov_b32_e32 v8, s34
+; SI-NEXT: v_mov_b32_e32 v9, s31
+; SI-NEXT: v_mov_b32_e32 v48, s30
+; SI-NEXT: v_mov_b32_e32 v49, s95
+; SI-NEXT: v_mov_b32_e32 v50, s94
+; SI-NEXT: v_mov_b32_e32 v51, s93
+; SI-NEXT: v_mov_b32_e32 v52, s92
+; SI-NEXT: v_mov_b32_e32 v53, s91
+; SI-NEXT: v_mov_b32_e32 v54, s90
+; SI-NEXT: v_mov_b32_e32 v55, s89
+; SI-NEXT: v_mov_b32_e32 v40, s88
+; SI-NEXT: v_mov_b32_e32 v41, s79
+; SI-NEXT: v_mov_b32_e32 v42, s78
; SI-NEXT: v_mov_b32_e32 v43, s77
; SI-NEXT: v_mov_b32_e32 v44, s76
-; SI-NEXT: v_mov_b32_e32 v46, s75
-; SI-NEXT: v_mov_b32_e32 v45, s74
+; SI-NEXT: v_mov_b32_e32 v45, s75
+; SI-NEXT: v_mov_b32_e32 v46, s74
; SI-NEXT: v_mov_b32_e32 v47, s73
; SI-NEXT: v_mov_b32_e32 v56, s72
-; SI-NEXT: v_mov_b32_e32 v58, s63
-; SI-NEXT: v_mov_b32_e32 v57, s62
-; SI-NEXT: v_mov_b32_e32 v60, s61
-; SI-NEXT: v_mov_b32_e32 v59, s60
+; SI-NEXT: v_mov_b32_e32 v57, s63
+; SI-NEXT: v_mov_b32_e32 v58, s62
+; SI-NEXT: v_mov_b32_e32 v59, s61
+; SI-NEXT: v_mov_b32_e32 v60, s60
; SI-NEXT: v_mov_b32_e32 v3, s4
; SI-NEXT: .LBB77_5: ; %end
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; SI-NEXT: v_readlane_b32 s99, v63, 35
; SI-NEXT: v_readlane_b32 s98, v63, 34
; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -130362,77 +131136,102 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg
; SI-NEXT: v_readlane_b32 s34, v63, 2
; SI-NEXT: v_readlane_b32 s31, v63, 1
; SI-NEXT: v_readlane_b32 s30, v63, 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -130672,213 +131471,224 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB78_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
@@ -130904,132 +131714,173 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; SI-NEXT: ; kill: killed $vgpr33
; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16
-; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16
-; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16
-; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16
-; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; kill: killed $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
@@ -131075,263 +131926,315 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; kill: killed $vgpr32
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: .LBB78_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB78_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB78_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
@@ -131365,12 +132268,12 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB78_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -131382,14 +132285,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -131400,14 +132303,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -131418,14 +132321,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -131436,14 +132339,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -131454,14 +132357,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -131472,14 +132375,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -131490,14 +132393,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -131508,14 +132411,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -131526,14 +132429,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -131544,14 +132447,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -131562,14 +132465,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -131580,14 +132483,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -131598,14 +132501,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -131616,14 +132519,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -131634,14 +132537,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -131652,15 +132555,15 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -131671,14 +132574,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -131689,14 +132592,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -131707,14 +132610,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -131725,14 +132628,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -131743,14 +132646,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -131761,14 +132664,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -131779,14 +132682,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -131797,14 +132700,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -131815,14 +132718,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -131833,14 +132736,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -131851,14 +132754,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -131869,14 +132772,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -131887,14 +132790,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
@@ -131905,14 +132808,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -131923,14 +132826,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -131941,8 +132844,8 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB78_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -133574,6 +134477,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-LABEL: bitcast_v64bf16_to_v16f64_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v52, v28
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
@@ -133590,533 +134494,621 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: v_mov_b32_e32 v39, v10
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: v_mov_b32_e32 v38, v12
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30
-; SI-NEXT: v_mov_b32_e32 v37, v14
-; SI-NEXT: v_mov_b32_e32 v14, v11
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB79_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; SI-NEXT: v_mov_b32_e32 v38, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v22, v26
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v43, v23
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16
-; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16
-; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16
-; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v35, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mov_b32_e32 v43, v8
-; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v42, v9
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
-; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v60, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v58, v11
-; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v11
-; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v46, v12
-; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v63, v14
-; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v44, v14
-; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v36, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v53, v38
-; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: s_cbranch_execnz .LBB79_3
; SI-NEXT: .LBB79_2: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: .LBB79_3: ; %end
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -134134,41 +135126,28 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB79_4:
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v61, v53
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v59, v2
-; SI-NEXT: v_mov_b32_e32 v57, v11
-; SI-NEXT: v_mov_b32_e32 v47, v10
-; SI-NEXT: v_mov_b32_e32 v45, v12
-; SI-NEXT: v_mov_b32_e32 v33, v14
-; SI-NEXT: v_mov_b32_e32 v62, v38
-; SI-NEXT: v_mov_b32_e32 v38, v39
-; SI-NEXT: v_mov_b32_e32 v39, v41
-; SI-NEXT: v_mov_b32_e32 v41, v40
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v55, v54
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v37
-; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v52, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
+; SI-NEXT: v_mov_b32_e32 v42, v49
+; SI-NEXT: v_mov_b32_e32 v44, v37
+; SI-NEXT: v_mov_b32_e32 v45, v39
+; SI-NEXT: v_mov_b32_e32 v46, v38
+; SI-NEXT: v_mov_b32_e32 v56, v48
+; SI-NEXT: v_mov_b32_e32 v57, v41
+; SI-NEXT: v_mov_b32_e32 v60, v40
+; SI-NEXT: v_mov_b32_e32 v59, v55
+; SI-NEXT: v_mov_b32_e32 v63, v54
+; SI-NEXT: v_mov_b32_e32 v36, v53
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: v_mov_b32_e32 v62, v51
+; SI-NEXT: v_mov_b32_e32 v58, v28
+; SI-NEXT: v_mov_b32_e32 v47, v31
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_mov_b32_e32 v33, v22
+; SI-NEXT: v_mov_b32_e32 v43, v23
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; SI-NEXT: s_branch .LBB79_2
;
@@ -134213,12 +135192,12 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB79_3
; VI-NEXT: .LBB79_2: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -134229,14 +135208,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -134247,14 +135226,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -134265,14 +135244,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -134283,14 +135262,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -134301,14 +135280,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -134319,14 +135298,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -134337,14 +135316,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -134355,14 +135334,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -134373,14 +135352,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -134391,14 +135370,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -134409,14 +135388,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -134427,14 +135406,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -134445,14 +135424,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -134463,14 +135442,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -134481,14 +135460,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -134499,14 +135478,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
@@ -134517,14 +135496,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
@@ -134535,14 +135514,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
@@ -134553,14 +135532,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
@@ -134571,14 +135550,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
@@ -134589,14 +135568,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
@@ -134607,14 +135586,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
@@ -134625,14 +135604,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
@@ -134643,14 +135622,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
@@ -134661,14 +135640,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
@@ -134679,14 +135658,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
@@ -134697,14 +135676,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
@@ -134715,14 +135694,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
@@ -134733,14 +135712,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
@@ -134751,14 +135730,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -134769,14 +135748,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -134787,8 +135766,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB79_3: ; %end
; VI-NEXT: v_mov_b32_e32 v18, v32
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -143190,19 +144169,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
@@ -143251,14 +144227,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160
; SI-NEXT: s_waitcnt vmcnt(7)
@@ -143293,14 +144266,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -143317,78 +144291,75 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2
+; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v1
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240
-; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1
+; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v1
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2
+; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3
+; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:272
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:296
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -143398,37 +144369,36 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:320
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:320
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v3
+; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -143436,429 +144406,58 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB88_2
-; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v54
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v60
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v12, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v16, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v20, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v22, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v24, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v28, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v29, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v1, v1, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -143936,730 +144535,1094 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_cbranch_execz .LBB88_2
+; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v56
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v59
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v27
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v27, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v6, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v6
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v6, v5, v6
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v17, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v30
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v30, v5, v9
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30
-; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v40
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v10, v10, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v49
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v57, v9
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: v_or_b32_e32 v21, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v19, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v53
-; SI-NEXT: v_or_b32_e32 v19, v5, v26
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v31, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v7, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v58
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_or_b32_e32 v23, v17, v15
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v46
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_or_b32_e32 v18, v3, v15
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v63
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v59
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: v_or_b32_e32 v17, v4, v3
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v33, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v42
-; SI-NEXT: v_or_b32_e32 v7, v5, v38
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v21
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v51, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v8, v5
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v11, v41, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v47
-; SI-NEXT: v_or_b32_e32 v8, v5, v44
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v36
-; SI-NEXT: v_or_b32_e32 v14, v13, v61
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v43
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_or_b32_e32 v5, v45, v5
-; SI-NEXT: v_or_b32_e32 v13, v62, v13
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: .LBB88_2: ; %Flow
-; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB88_4
-; SI-NEXT: ; %bb.3: ; %cmp.true
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v63
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_or_b32_e32 v2, v2, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v2
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: s_movk_i32 s6, 0x300
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v5
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v60
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43
-; SI-NEXT: v_or_b32_e32 v4, v61, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
-; SI-NEXT: v_or_b32_e32 v5, v62, v5
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v54
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
-; SI-NEXT: v_or_b32_e32 v9, v57, v9
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_or_b32_e32 v5, v9, v5
-; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v47
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36
-; SI-NEXT: v_or_b32_e32 v9, v44, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9
-; SI-NEXT: v_or_b32_e32 v10, v45, v10
-; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v14
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; SI-NEXT: v_or_b32_e32 v8, v8, v10
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v25
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
-; SI-NEXT: v_or_b32_e32 v10, v41, v10
-; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT: v_or_b32_e32 v8, v10, v8
-; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v21
-; SI-NEXT: v_or_b32_e32 v10, v38, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10
-; SI-NEXT: v_or_b32_e32 v11, v51, v11
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v15
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v7, v7, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: s_mov_b32 s7, 0x3000000
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v11, v33, v11
-; SI-NEXT: v_or_b32_e32 v11, v11, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12
-; SI-NEXT: v_or_b32_e32 v7, v26, v7
-; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
-; SI-NEXT: v_or_b32_e32 v12, v31, v12
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: v_or_b32_e32 v12, v12, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_or_b32_e32 v7, v13, v7
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_or_b32_e32 v13, v19, v13
-; SI-NEXT: v_or_b32_e32 v13, v13, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_or_b32_e32 v7, v14, v7
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: v_or_b32_e32 v14, v14, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v22, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT: v_or_b32_e32 v7, v15, v7
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
-; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: v_or_b32_e32 v15, v15, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v16, v7
-; SI-NEXT: v_or_b32_e32 v16, v7, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v17, v7
-; SI-NEXT: v_or_b32_e32 v17, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_or_b32_e32 v26, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v26
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v18, v7
-; SI-NEXT: v_or_b32_e32 v18, v7, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v19, v7
-; SI-NEXT: v_or_b32_e32 v19, v7, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v29, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v20, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v21, v6, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v22, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v23, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_or_b32_e32 v30, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v11
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v24, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v5, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v1, v6, v1
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v25, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10
-; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16
-; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19
-; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22
-; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v25
+; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_or_b32_e32 v26, v6, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; SI-NEXT: v_or_b32_e32 v11, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11
+; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v27, v6, v1
-; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v12
-; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15
-; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v18
-; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v21
-; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11
-; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17
-; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v20
-; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v23
-; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v26
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT: v_or_b32_e32 v23, v27, v23
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: v_or_b32_e32 v24, v25, v24
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_or_b32_e32 v25, v26, v25
-; SI-NEXT: v_or_b32_e32 v24, v25, v24
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v13, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v36
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v16, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v23, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v9
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v27, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v53
+; SI-NEXT: v_or_b32_e32 v8, v1, v8
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v14, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v18
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v28, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v42
+; SI-NEXT: v_or_b32_e32 v9, v1, v39
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v31
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v51, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v24
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v20
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v41, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v47
+; SI-NEXT: v_or_b32_e32 v12, v1, v44
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v38
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v45, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v40
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v54
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v50
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v57, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v60
+; SI-NEXT: v_or_b32_e32 v14, v1, v61
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v43
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v2, v62, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v58
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v10, v1
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v46
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v15, v63, v10
+; SI-NEXT: v_or_b32_e32 v10, v3, v7
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: .LBB88_2: ; %Flow
+; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT: s_cbranch_execz .LBB88_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59
+; SI-NEXT: v_or_b32_e32 v1, v7, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v58
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: s_movk_i32 s6, 0x300
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; SI-NEXT: v_or_b32_e32 v3, v63, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43
+; SI-NEXT: v_or_b32_e32 v3, v61, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; SI-NEXT: v_or_b32_e32 v4, v62, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v40
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54
+; SI-NEXT: v_or_b32_e32 v4, v6, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; SI-NEXT: v_or_b32_e32 v6, v57, v6
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v4, v6, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38
+; SI-NEXT: v_or_b32_e32 v6, v44, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; SI-NEXT: v_or_b32_e32 v7, v45, v7
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v6, v7, v6
+; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v20
+; SI-NEXT: v_or_b32_e32 v7, v10, v7
+; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7
+; SI-NEXT: v_or_b32_e32 v10, v41, v10
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v7, v10, v7
+; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v31
+; SI-NEXT: v_or_b32_e32 v10, v39, v10
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; SI-NEXT: v_or_b32_e32 v12, v51, v12
+; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: v_or_b32_e32 v10, v12, v10
+; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v18
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v12
+; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; SI-NEXT: v_or_b32_e32 v12, v28, v12
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT: v_or_b32_e32 v9, v12, v9
+; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_or_b32_e32 v8, v8, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_mov_b32 s7, 0x3000000
+; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9
+; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7
+; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v14, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT: v_or_b32_e32 v8, v14, v8
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v27, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_or_b32_e32 v8, v15, v8
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v23, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_or_b32_e32 v8, v17, v8
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v18, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v16, v8
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v18, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
+; SI-NEXT: v_or_b32_e32 v8, v18, v8
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v19, v18
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v18, v18, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v25
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v8, v19, v8
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v20, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; SI-NEXT: v_or_b32_e32 v8, v20, v8
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v8, v13, v8
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v21, v13
+; SI-NEXT: v_or_b32_e32 v21, v13, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v11
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16
+; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v8, v11, v8
+; SI-NEXT: v_or_b32_e32 v22, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v11, v8
+; SI-NEXT: v_or_b32_e32 v23, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v11, v8
+; SI-NEXT: v_or_b32_e32 v24, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v11, v8
+; SI-NEXT: v_or_b32_e32 v25, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: v_or_b32_e32 v5, v8, v5
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v11, v8
+; SI-NEXT: v_or_b32_e32 v26, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v19
+; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v27, v8, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v28, v8, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2
+; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v6
+; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v12
+; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v21
+; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18
+; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v22
+; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v23, v28, v23
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23
+; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; SI-NEXT: v_or_b32_e32 v24, v25, v24
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v26, v25
+; SI-NEXT: v_or_b32_e32 v24, v25, v24
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
@@ -144710,7 +145673,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27
-; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v27
+; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v27
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
@@ -144801,7 +145764,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32
; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
@@ -144821,7 +145784,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
@@ -144831,387 +145794,435 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v17
; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
; SI-NEXT: .LBB88_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v55
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v52
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v48
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v37
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v35
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v34
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v30
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v22
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v32
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v29
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v26
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -149433,13 +150444,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s72, s21
+; SI-NEXT: s_mov_b32 s73, s28
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_writelane_b32 v43, s19, 0
; SI-NEXT: v_writelane_b32 v43, s18, 1
; SI-NEXT: v_writelane_b32 v43, s17, 2
; SI-NEXT: v_writelane_b32 v43, s16, 3
-; SI-NEXT: s_mov_b32 s60, s24
+; SI-NEXT: s_mov_b32 s79, s25
; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: v_writelane_b32 v41, s31, 1
; SI-NEXT: v_writelane_b32 v41, s34, 2
@@ -149464,8 +150475,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s69, 21
; SI-NEXT: v_writelane_b32 v41, s70, 22
; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: s_mov_b32 s77, s28
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: s_mov_b32 s58, s29
+; SI-NEXT: s_mov_b32 s61, s27
+; SI-NEXT: s_mov_b32 s77, s26
+; SI-NEXT: s_mov_b32 s88, s23
; SI-NEXT: v_writelane_b32 v41, s80, 24
; SI-NEXT: v_writelane_b32 v41, s81, 25
; SI-NEXT: v_writelane_b32 v41, s82, 26
@@ -149478,7 +150491,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
-; SI-NEXT: s_mov_b32 s79, s26
+; SI-NEXT: s_mov_b32 s63, s21
; SI-NEXT: v_readfirstlane_b32 s38, v20
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s39, v19
@@ -149504,9 +150517,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
-; SI-NEXT: v_readfirstlane_b32 s88, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
+; SI-NEXT: v_readfirstlane_b32 s78, v4
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s6, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
@@ -149542,6 +150553,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v38
; SI-NEXT: v_writelane_b32 v43, s4, 10
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: v_readfirstlane_b32 s90, v9
; SI-NEXT: v_readfirstlane_b32 s91, v10
; SI-NEXT: v_readfirstlane_b32 s92, v8
; SI-NEXT: v_readfirstlane_b32 s93, v7
@@ -149578,39 +150591,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s75, v32
+; SI-NEXT: v_readfirstlane_b32 s56, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s61, v33
+; SI-NEXT: v_readfirstlane_b32 s75, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v43, s4, 16
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s43, v34
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s40, v35
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: v_readfirstlane_b32 s42, v36
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s63, v37
+; SI-NEXT: v_readfirstlane_b32 s62, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v43, s4, 17
+; SI-NEXT: v_writelane_b32 v43, s4, 16
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s59, v31
+; SI-NEXT: v_readfirstlane_b32 s74, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s42, v38
+; SI-NEXT: v_readfirstlane_b32 s60, v38
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s73, v39
+; SI-NEXT: v_readfirstlane_b32 s28, v39
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s21, v48
+; SI-NEXT: v_readfirstlane_b32 s57, v48
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s57, v49
+; SI-NEXT: v_readfirstlane_b32 s59, v49
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s13, v50
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s45, v51
+; SI-NEXT: v_readfirstlane_b32 s76, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -149619,25 +150631,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s47, v32
+; SI-NEXT: v_readfirstlane_b32 s25, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s24, v33
+; SI-NEXT: v_readfirstlane_b32 s46, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s78, v34
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: v_writelane_b32 v43, s4, 17
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
+; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
; SI-NEXT: v_writelane_b32 v43, s4, 19
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: v_writelane_b32 v43, s4, 20
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 21
@@ -149688,20 +150703,20 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s4, v40
; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: v_writelane_b32 v43, s22, 34
-; SI-NEXT: v_writelane_b32 v43, s23, 35
-; SI-NEXT: v_writelane_b32 v43, s72, 36
+; SI-NEXT: v_writelane_b32 v43, s88, 35
+; SI-NEXT: v_writelane_b32 v43, s63, 36
; SI-NEXT: v_writelane_b32 v43, s20, 37
-; SI-NEXT: v_writelane_b32 v43, s79, 38
-; SI-NEXT: v_writelane_b32 v43, s76, 39
-; SI-NEXT: v_writelane_b32 v43, s25, 40
-; SI-NEXT: v_writelane_b32 v43, s60, 41
-; SI-NEXT: v_writelane_b32 v43, s29, 42
-; SI-NEXT: v_writelane_b32 v43, s77, 43
+; SI-NEXT: v_writelane_b32 v43, s77, 38
+; SI-NEXT: v_writelane_b32 v43, s61, 39
+; SI-NEXT: v_writelane_b32 v43, s79, 40
+; SI-NEXT: v_writelane_b32 v43, s24, 41
+; SI-NEXT: v_writelane_b32 v43, s58, 42
+; SI-NEXT: v_writelane_b32 v43, s73, 43
; SI-NEXT: v_writelane_b32 v43, s16, 44
; SI-NEXT: v_writelane_b32 v43, s17, 45
; SI-NEXT: v_writelane_b32 v43, s18, 46
; SI-NEXT: v_writelane_b32 v43, s19, 47
-; SI-NEXT: v_writelane_b32 v43, s88, 48
+; SI-NEXT: v_writelane_b32 v43, s78, 48
; SI-NEXT: v_writelane_b32 v43, s89, 49
; SI-NEXT: v_writelane_b32 v43, s90, 50
; SI-NEXT: v_writelane_b32 v43, s91, 51
@@ -149710,15 +150725,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v43, s94, 54
; SI-NEXT: v_writelane_b32 v43, s95, 55
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s62, v33
+; SI-NEXT: v_readfirstlane_b32 s26, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s10, v34
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s66, v35
-; SI-NEXT: v_readfirstlane_b32 s28, v31
+; SI-NEXT: v_readfirstlane_b32 s23, v31
; SI-NEXT: v_readfirstlane_b32 s27, v32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s58, v36
+; SI-NEXT: v_readfirstlane_b32 s29, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s69, v37
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -149758,9 +150773,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v43, s36, 62
; SI-NEXT: v_writelane_b32 v43, s37, 63
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s74, v31
+; SI-NEXT: v_readfirstlane_b32 s21, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s46, v32
+; SI-NEXT: v_readfirstlane_b32 s45, v32
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s96, v33
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -149768,7 +150783,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s41, v35
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s56, v36
+; SI-NEXT: v_readfirstlane_b32 s72, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s87, v37
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -149784,7 +150799,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s26, v48
+; SI-NEXT: v_readfirstlane_b32 s47, v48
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v49
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -149843,25 +150858,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v42, s41, 28
; SI-NEXT: v_writelane_b32 v42, s80, 29
; SI-NEXT: v_writelane_b32 v42, s7, 30
-; SI-NEXT: v_writelane_b32 v42, s56, 31
-; SI-NEXT: v_writelane_b32 v42, s26, 32
+; SI-NEXT: v_writelane_b32 v42, s72, 31
+; SI-NEXT: v_writelane_b32 v42, s47, 32
; SI-NEXT: v_writelane_b32 v42, s15, 33
; SI-NEXT: v_writelane_b32 v42, s14, 34
; SI-NEXT: v_writelane_b32 v42, s69, 35
; SI-NEXT: v_writelane_b32 v42, s71, 36
; SI-NEXT: v_writelane_b32 v42, s70, 37
; SI-NEXT: v_writelane_b32 v42, s68, 38
-; SI-NEXT: v_writelane_b32 v42, s74, 39
-; SI-NEXT: v_writelane_b32 v42, s46, 40
+; SI-NEXT: v_writelane_b32 v42, s21, 39
+; SI-NEXT: v_writelane_b32 v42, s45, 40
; SI-NEXT: v_writelane_b32 v42, s11, 41
; SI-NEXT: v_writelane_b32 v42, s10, 42
-; SI-NEXT: v_writelane_b32 v42, s62, 43
+; SI-NEXT: v_writelane_b32 v42, s26, 43
; SI-NEXT: v_writelane_b32 v42, s66, 44
-; SI-NEXT: v_writelane_b32 v42, s58, 45
-; SI-NEXT: v_writelane_b32 v42, s28, 46
+; SI-NEXT: v_writelane_b32 v42, s29, 45
+; SI-NEXT: v_writelane_b32 v42, s23, 46
; SI-NEXT: v_writelane_b32 v42, s27, 47
-; SI-NEXT: v_writelane_b32 v42, s78, 48
-; SI-NEXT: v_writelane_b32 v42, s24, 49
+; SI-NEXT: v_writelane_b32 v42, s46, 48
+; SI-NEXT: v_writelane_b32 v42, s13, 49
; SI-NEXT: s_cbranch_scc0 .LBB89_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_readlane_b32 s4, v43, 3
@@ -149870,66 +150885,65 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v42, s4, 56
+; SI-NEXT: v_writelane_b32 v42, s4, 50
; SI-NEXT: v_readlane_b32 s4, v43, 1
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: v_readlane_b32 s5, v43, 0
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v42, s4, 57
+; SI-NEXT: v_writelane_b32 v42, s4, 51
; SI-NEXT: s_and_b32 s4, s20, 0xff
-; SI-NEXT: s_lshl_b32 s5, s72, 8
+; SI-NEXT: s_lshl_b32 s5, s63, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s22, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_mov_b32 s22, s6
-; SI-NEXT: s_lshl_b32 s6, s23, 24
-; SI-NEXT: v_writelane_b32 v42, s4, 58
+; SI-NEXT: s_lshl_b32 s6, s88, 24
+; SI-NEXT: v_writelane_b32 v42, s4, 52
; SI-NEXT: s_or_b32 s4, s6, s5
-; SI-NEXT: s_and_b32 s5, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s25, 24
-; SI-NEXT: v_writelane_b32 v42, s4, 59
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_writelane_b32 v42, s5, 60
-; SI-NEXT: s_and_b32 s5, s79, 0xff
+; SI-NEXT: s_and_b32 s5, s24, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s76, 24
-; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_writelane_b32 v42, s5, 61
+; SI-NEXT: s_lshl_b32 s6, s79, 24
+; SI-NEXT: v_writelane_b32 v42, s4, 53
+; SI-NEXT: s_or_b32 s4, s6, s5
; SI-NEXT: s_and_b32 s5, s77, 0xff
-; SI-NEXT: s_lshl_b32 s6, s29, 8
+; SI-NEXT: s_lshl_b32 s5, s5, 16
+; SI-NEXT: s_lshl_b32 s6, s61, 24
+; SI-NEXT: v_writelane_b32 v42, s4, 54
+; SI-NEXT: s_or_b32 s4, s6, s5
+; SI-NEXT: s_and_b32 s5, s73, 0xff
+; SI-NEXT: s_lshl_b32 s6, s58, 8
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_and_b32 s6, s16, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s17, 24
-; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: v_writelane_b32 v42, s6, 62
+; SI-NEXT: v_writelane_b32 v42, s4, 55
+; SI-NEXT: s_or_b32 s4, s16, s6
; SI-NEXT: s_and_b32 s6, s89, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s16, s88, 24
-; SI-NEXT: s_mov_b32 s4, s47
-; SI-NEXT: s_or_b32 s47, s16, s6
+; SI-NEXT: s_lshl_b32 s16, s78, 24
+; SI-NEXT: v_writelane_b32 v42, s4, 56
+; SI-NEXT: s_or_b32 s4, s16, s6
; SI-NEXT: s_and_b32 s6, s18, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s19, 24
-; SI-NEXT: s_or_b32 s25, s16, s6
+; SI-NEXT: s_or_b32 s24, s16, s6
; SI-NEXT: s_and_b32 s6, s93, 0xff
; SI-NEXT: s_lshl_b32 s16, s92, 8
; SI-NEXT: s_or_b32 s6, s6, s16
; SI-NEXT: s_and_b32 s16, s90, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, s91, 24
-; SI-NEXT: s_or_b32 s92, s17, s16
+; SI-NEXT: s_or_b32 s78, s17, s16
; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24
-; SI-NEXT: s_or_b32 s76, s17, s16
+; SI-NEXT: s_or_b32 s79, s17, s16
; SI-NEXT: s_and_b32 s16, s94, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, s95, 24
-; SI-NEXT: s_or_b32 s91, s17, s16
+; SI-NEXT: s_or_b32 s73, s17, s16
; SI-NEXT: s_and_b32 s16, s35, 0xff
; SI-NEXT: s_lshl_b32 s17, s34, 8
; SI-NEXT: s_or_b32 s16, s16, s17
@@ -149940,33 +150954,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s17, s39, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_lshl_b32 s18, s38, 24
-; SI-NEXT: s_or_b32 s79, s18, s17
+; SI-NEXT: s_or_b32 s88, s18, s17
; SI-NEXT: s_and_b32 s17, s36, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_lshl_b32 s18, s37, 24
-; SI-NEXT: s_or_b32 s93, s18, s17
+; SI-NEXT: s_or_b32 s89, s18, s17
; SI-NEXT: s_and_b32 s17, s51, 0xff
; SI-NEXT: s_lshl_b32 s18, s50, 8
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s18, s48, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s49, 24
-; SI-NEXT: s_or_b32 s89, s19, s18
+; SI-NEXT: v_writelane_b32 v42, s4, 57
+; SI-NEXT: s_mov_b32 s4, s59
+; SI-NEXT: s_or_b32 s59, s19, s18
; SI-NEXT: s_and_b32 s18, s55, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s54, 24
-; SI-NEXT: s_or_b32 s31, s19, s18
+; SI-NEXT: s_or_b32 s61, s19, s18
; SI-NEXT: s_and_b32 s18, s52, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s53, 24
-; SI-NEXT: s_or_b32 s94, s19, s18
+; SI-NEXT: s_or_b32 s48, s19, s18
; SI-NEXT: s_and_b32 s18, s84, 0xff
; SI-NEXT: s_lshl_b32 s19, s67, 8
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_and_b32 s19, s64, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s65, 24
-; SI-NEXT: s_or_b32 s60, s20, s19
+; SI-NEXT: s_or_b32 s93, s20, s19
; SI-NEXT: s_and_b32 s19, s12, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s8, 24
@@ -149985,139 +151001,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s19, s15, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: s_or_b32 s7, s20, s19
+; SI-NEXT: s_or_b32 s52, s20, s19
; SI-NEXT: s_and_b32 s19, s82, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s83, 24
-; SI-NEXT: s_or_b32 s23, s20, s19
-; SI-NEXT: s_and_b32 s19, s26, 0xff
+; SI-NEXT: s_or_b32 s53, s20, s19
+; SI-NEXT: s_and_b32 s19, s47, 0xff
; SI-NEXT: s_lshl_b32 s20, s81, 8
; SI-NEXT: s_or_b32 vcc_hi, s19, s20
; SI-NEXT: s_and_b32 s19, s99, 0xff
-; SI-NEXT: v_writelane_b32 v42, s9, 50
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s87, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 51
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s56, 0xff
+; SI-NEXT: s_or_b32 s54, s20, s19
+; SI-NEXT: s_and_b32 s19, s72, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s41, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 52
-; SI-NEXT: s_or_b32 s7, s20, s19
+; SI-NEXT: s_or_b32 s49, s20, s19
; SI-NEXT: s_and_b32 s19, s98, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s96, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 54
-; SI-NEXT: s_or_b32 s7, s20, s19
-; SI-NEXT: s_and_b32 s19, s46, 0xff
-; SI-NEXT: s_lshl_b32 s20, s74, 8
+; SI-NEXT: s_or_b32 s90, s20, s19
+; SI-NEXT: s_and_b32 s19, s45, 0xff
+; SI-NEXT: s_lshl_b32 s20, s21, 8
; SI-NEXT: s_or_b32 s84, s19, s20
; SI-NEXT: s_and_b32 s19, s71, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s70, 24
-; SI-NEXT: s_or_b32 s72, s20, s19
+; SI-NEXT: s_or_b32 s92, s20, s19
; SI-NEXT: s_and_b32 s19, s11, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s68, 24
-; SI-NEXT: v_writelane_b32 v42, s7, 53
; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s14, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s69, 24
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s58, 0xff
+; SI-NEXT: s_or_b32 s58, s20, s19
+; SI-NEXT: s_and_b32 s19, s29, 0xff
; SI-NEXT: s_lshl_b32 s20, s66, 8
; SI-NEXT: s_or_b32 s85, s19, s20
; SI-NEXT: s_and_b32 s19, s10, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s62, 24
-; SI-NEXT: s_or_b32 s49, s20, s19
+; SI-NEXT: s_lshl_b32 s20, s26, 24
+; SI-NEXT: s_or_b32 s63, s20, s19
; SI-NEXT: s_and_b32 s19, s27, 0xff
-; SI-NEXT: v_writelane_b32 v42, s9, 55
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s28, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 33
-; SI-NEXT: s_or_b32 s50, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 32
+; SI-NEXT: s_lshl_b32 s20, s23, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 33
+; SI-NEXT: s_or_b32 s10, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 32
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 31
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 31
; SI-NEXT: s_or_b32 s51, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 30
-; SI-NEXT: s_lshl_b32 s20, s9, 8
-; SI-NEXT: v_readlane_b32 s9, v43, 29
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 30
+; SI-NEXT: s_lshl_b32 s20, s11, 8
+; SI-NEXT: v_readlane_b32 s11, v43, 29
; SI-NEXT: s_or_b32 s86, s19, s20
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 28
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 28
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 27
-; SI-NEXT: s_or_b32 s52, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 26
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 27
+; SI-NEXT: s_or_b32 s55, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 26
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 25
-; SI-NEXT: s_or_b32 s53, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 24
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 25
+; SI-NEXT: s_or_b32 s14, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 24
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 23
-; SI-NEXT: s_or_b32 s54, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 22
-; SI-NEXT: s_lshl_b32 s20, s9, 8
-; SI-NEXT: v_readlane_b32 s9, v43, 21
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 23
+; SI-NEXT: s_or_b32 s64, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 22
+; SI-NEXT: s_lshl_b32 s20, s11, 8
+; SI-NEXT: v_readlane_b32 s11, v43, 21
; SI-NEXT: s_or_b32 s87, s19, s20
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 20
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 20
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v43, 19
-; SI-NEXT: s_or_b32 s55, s20, s19
-; SI-NEXT: s_mov_b32 s58, s9
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 18
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 19
+; SI-NEXT: s_or_b32 s65, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: v_readlane_b32 s11, v43, 18
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: s_or_b32 s64, s20, s19
-; SI-NEXT: s_and_b32 s19, s78, 0xff
+; SI-NEXT: s_lshl_b32 s20, s11, 24
+; SI-NEXT: v_readlane_b32 s11, v43, 17
+; SI-NEXT: s_or_b32 s15, s20, s19
+; SI-NEXT: s_and_b32 s19, s11, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s24, 24
-; SI-NEXT: s_or_b32 s65, s20, s19
-; SI-NEXT: s_and_b32 s19, s4, 0xff
-; SI-NEXT: s_lshl_b32 s20, s45, 8
+; SI-NEXT: s_lshl_b32 s20, s46, 24
+; SI-NEXT: s_or_b32 s66, s20, s19
+; SI-NEXT: s_and_b32 s19, s25, 0xff
+; SI-NEXT: s_lshl_b32 s20, s76, 8
; SI-NEXT: s_or_b32 s26, s19, s20
; SI-NEXT: s_and_b32 s19, s13, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s57, 24
-; SI-NEXT: s_or_b32 s66, s20, s19
-; SI-NEXT: s_and_b32 s19, s21, 0xff
-; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s73, 24
+; SI-NEXT: s_lshl_b32 s20, s4, 24
; SI-NEXT: s_or_b32 s67, s20, s19
-; SI-NEXT: s_and_b32 s19, s42, 0xff
-; SI-NEXT: v_readlane_b32 s88, v43, 17
+; SI-NEXT: s_and_b32 s19, s57, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s59, 24
+; SI-NEXT: s_lshl_b32 s20, s28, 24
; SI-NEXT: s_or_b32 s68, s20, s19
-; SI-NEXT: s_and_b32 s19, s63, 0xff
-; SI-NEXT: s_lshl_b32 s20, s88, 8
+; SI-NEXT: s_and_b32 s19, s60, 0xff
+; SI-NEXT: s_lshl_b32 s19, s19, 16
+; SI-NEXT: s_lshl_b32 s20, s74, 24
+; SI-NEXT: s_or_b32 s69, s20, s19
+; SI-NEXT: s_and_b32 s19, s62, 0xff
+; SI-NEXT: s_lshl_b32 s20, s42, 8
; SI-NEXT: s_or_b32 s27, s19, s20
; SI-NEXT: s_and_b32 s19, s40, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s43, 24
-; SI-NEXT: s_or_b32 s69, s20, s19
-; SI-NEXT: s_and_b32 s19, s61, 0xff
-; SI-NEXT: s_mov_b32 s39, s57
+; SI-NEXT: s_or_b32 s70, s20, s19
+; SI-NEXT: s_and_b32 s19, s75, 0xff
+; SI-NEXT: s_mov_b32 s34, s57
; SI-NEXT: s_mov_b32 s57, s7
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s75, 24
+; SI-NEXT: s_lshl_b32 s20, s56, 24
; SI-NEXT: v_readlane_b32 s7, v43, 16
-; SI-NEXT: s_or_b32 s70, s20, s19
+; SI-NEXT: s_mov_b32 s95, s42
+; SI-NEXT: s_mov_b32 s42, s40
+; SI-NEXT: s_mov_b32 s35, s56
+; SI-NEXT: s_mov_b32 s56, s10
+; SI-NEXT: s_or_b32 s40, s20, s19
; SI-NEXT: s_mov_b32 s10, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: v_readlane_b32 s7, v43, 15
@@ -150125,23 +151138,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_mov_b32 s71, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
; SI-NEXT: v_readlane_b32 s7, v43, 14
-; SI-NEXT: s_or_b32 s62, s20, s19
+; SI-NEXT: s_mov_b32 s13, s93
+; SI-NEXT: s_mov_b32 s36, s43
+; SI-NEXT: s_mov_b32 s43, s15
+; SI-NEXT: s_or_b32 s93, s20, s19
; SI-NEXT: s_mov_b32 s15, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: v_readlane_b32 s7, v43, 13
; SI-NEXT: s_mov_b32 s41, s7
; SI-NEXT: s_lshl_b32 s20, s7, 8
; SI-NEXT: v_readlane_b32 s7, v43, 12
+; SI-NEXT: s_mov_b32 s46, s14
; SI-NEXT: s_or_b32 s29, s19, s20
; SI-NEXT: s_mov_b32 s14, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: v_readlane_b32 s7, v43, 11
+; SI-NEXT: s_mov_b32 s47, s76
+; SI-NEXT: s_mov_b32 s76, s9
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
; SI-NEXT: v_readlane_b32 s7, v43, 10
; SI-NEXT: s_or_b32 s80, s20, s19
-; SI-NEXT: s_mov_b32 s56, s7
+; SI-NEXT: s_mov_b32 s72, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: v_readlane_b32 s7, v43, 9
; SI-NEXT: s_lshl_b32 s19, s19, 16
@@ -150156,43 +151175,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_mov_b32 s96, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
; SI-NEXT: v_readlane_b32 s7, v43, 6
-; SI-NEXT: s_mov_b32 s36, s63
-; SI-NEXT: s_mov_b32 s63, s93
-; SI-NEXT: s_mov_b32 s93, s61
-; SI-NEXT: s_mov_b32 s61, s91
-; SI-NEXT: s_mov_b32 s91, s75
-; SI-NEXT: s_mov_b32 s75, s92
-; SI-NEXT: s_or_b32 s92, s20, s19
+; SI-NEXT: s_or_b32 s45, s20, s19
; SI-NEXT: s_mov_b32 s98, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
; SI-NEXT: v_readlane_b32 s7, v43, 5
; SI-NEXT: s_mov_b32 s44, s7
; SI-NEXT: s_lshl_b32 s20, s7, 8
; SI-NEXT: v_readlane_b32 s7, v43, 4
-; SI-NEXT: s_mov_b32 s48, s13
-; SI-NEXT: s_mov_b32 s13, s94
-; SI-NEXT: s_mov_b32 s94, s21
; SI-NEXT: s_or_b32 s21, s19, s20
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: s_mov_b32 s95, s4
+; SI-NEXT: s_mov_b32 s39, s4
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s22, 24
-; SI-NEXT: v_readlane_b32 s4, v42, 58
-; SI-NEXT: s_mov_b32 s46, s45
-; SI-NEXT: s_mov_b32 s34, s73
-; SI-NEXT: s_mov_b32 s73, s12
-; SI-NEXT: s_mov_b32 s37, s42
-; SI-NEXT: s_mov_b32 s38, s59
-; SI-NEXT: s_mov_b32 s59, s8
-; SI-NEXT: s_mov_b32 s30, s88
-; SI-NEXT: s_mov_b32 s88, s31
-; SI-NEXT: s_mov_b32 s78, s40
-; SI-NEXT: s_mov_b32 s31, s43
+; SI-NEXT: v_readlane_b32 s4, v42, 52
+; SI-NEXT: s_mov_b32 s91, s25
+; SI-NEXT: s_mov_b32 s31, s28
+; SI-NEXT: s_mov_b32 s50, s60
+; SI-NEXT: s_mov_b32 s60, s8
+; SI-NEXT: s_mov_b32 s94, s74
+; SI-NEXT: s_mov_b32 s74, s12
+; SI-NEXT: s_mov_b32 s38, s62
+; SI-NEXT: s_mov_b32 s37, s75
; SI-NEXT: s_mov_b32 s12, s7
; SI-NEXT: s_mov_b32 s7, s22
; SI-NEXT: s_or_b32 s83, s20, s19
; SI-NEXT: s_lshl_b32 s20, s4, 16
-; SI-NEXT: s_lshl_b32 s74, s5, 16
+; SI-NEXT: s_lshl_b32 s62, s5, 16
; SI-NEXT: s_lshl_b32 s22, s6, 16
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s19, s17, 16
@@ -150204,16 +151212,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s97, s86, 16
; SI-NEXT: s_lshl_b32 s28, s87, 16
; SI-NEXT: s_lshl_b32 s87, s26, 16
-; SI-NEXT: v_readlane_b32 s26, v42, 56
+; SI-NEXT: v_readlane_b32 s26, v42, 50
; SI-NEXT: s_lshl_b32 s86, s27, 16
-; SI-NEXT: v_readlane_b32 s27, v42, 57
-; SI-NEXT: v_readlane_b32 s35, v42, 61
+; SI-NEXT: v_readlane_b32 s27, v42, 51
+; SI-NEXT: v_readlane_b32 s30, v42, 56
; SI-NEXT: s_lshl_b32 s85, s29, 16
-; SI-NEXT: v_readlane_b32 s29, v42, 60
-; SI-NEXT: v_readlane_b32 s24, v42, 59
-; SI-NEXT: v_readlane_b32 s90, v42, 62
+; SI-NEXT: v_readlane_b32 s29, v42, 53
+; SI-NEXT: v_readlane_b32 s25, v42, 55
+; SI-NEXT: v_readlane_b32 s23, v42, 54
; SI-NEXT: s_lshl_b32 s84, s21, 16
-; SI-NEXT: s_mov_b32 s21, s47
+; SI-NEXT: v_readlane_b32 s21, v42, 57
; SI-NEXT: s_cbranch_execnz .LBB89_3
; SI-NEXT: .LBB89_2: ; %cmp.true
; SI-NEXT: s_add_i32 s4, s98, 3
@@ -150228,7 +151236,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: s_add_i32 s5, s56, 3
+; SI-NEXT: s_add_i32 s5, s72, 3
; SI-NEXT: s_and_b32 s5, s5, 0xff
; SI-NEXT: s_lshl_b32 s6, s81, 8
; SI-NEXT: s_add_i32 s16, s82, 3
@@ -150252,9 +151260,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: s_add_i32 s16, s93, 3
+; SI-NEXT: s_add_i32 s16, s37, 3
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: s_lshl_b32 s17, s91, 8
+; SI-NEXT: s_lshl_b32 s17, s35, 8
; SI-NEXT: s_add_i32 s18, s10, 3
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_and_b32 s18, s18, 0xff
@@ -150264,34 +151272,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: s_add_i32 s17, s36, 3
+; SI-NEXT: s_add_i32 s17, s38, 3
; SI-NEXT: s_and_b32 s17, s17, 0xff
-; SI-NEXT: s_lshl_b32 s18, s30, 8
-; SI-NEXT: s_add_i32 s19, s78, 3
+; SI-NEXT: s_lshl_b32 s18, s95, 8
+; SI-NEXT: s_add_i32 s19, s42, 3
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s18, s31, 24
+; SI-NEXT: s_lshl_b32 s18, s36, 24
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_addk_i32 s17, 0x300
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_and_b32 s17, s17, 0xffff
; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: s_add_i32 s18, s94, 3
+; SI-NEXT: s_add_i32 s18, s34, 3
; SI-NEXT: s_and_b32 s18, s18, 0xff
-; SI-NEXT: s_lshl_b32 s19, s34, 8
-; SI-NEXT: s_add_i32 s20, s37, 3
+; SI-NEXT: s_lshl_b32 s19, s31, 8
+; SI-NEXT: s_add_i32 s20, s50, 3
; SI-NEXT: s_or_b32 s18, s19, s18
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s19, s38, 24
+; SI-NEXT: s_lshl_b32 s19, s94, 24
; SI-NEXT: s_lshl_b32 s20, s20, 16
; SI-NEXT: s_addk_i32 s18, 0x300
; SI-NEXT: s_or_b32 s19, s19, s20
; SI-NEXT: s_and_b32 s18, s18, 0xffff
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: s_add_i32 s19, s95, 3
+; SI-NEXT: s_add_i32 s19, s91, 3
+; SI-NEXT: v_readlane_b32 s7, v42, 49
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s20, s46, 8
-; SI-NEXT: s_add_i32 s22, s48, 3
+; SI-NEXT: s_lshl_b32 s20, s47, 8
+; SI-NEXT: s_add_i32 s22, s7, 3
; SI-NEXT: s_or_b32 s19, s20, s19
; SI-NEXT: s_and_b32 s22, s22, 0xff
; SI-NEXT: s_lshl_b32 s20, s39, 24
@@ -150299,15 +151308,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_addk_i32 s19, 0x300
; SI-NEXT: s_or_b32 s20, s20, s22
; SI-NEXT: s_and_b32 s19, s19, 0xffff
+; SI-NEXT: v_readlane_b32 s7, v43, 19
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: s_add_i32 s20, s58, 3
+; SI-NEXT: s_add_i32 s20, s7, 3
; SI-NEXT: v_readlane_b32 s7, v43, 18
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s22, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v42, 49
+; SI-NEXT: v_readlane_b32 s7, v42, 48
; SI-NEXT: s_or_b32 s20, s22, s20
; SI-NEXT: s_lshl_b32 s22, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v42, 48
+; SI-NEXT: v_readlane_b32 s7, v43, 17
; SI-NEXT: s_add_i32 s23, s7, 3
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_lshl_b32 s23, s23, 16
@@ -150688,53 +151698,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s25, s25, 16
; SI-NEXT: s_and_b32 s20, s20, 0xffff
; SI-NEXT: s_or_b32 s24, s24, s25
-; SI-NEXT: s_and_b32 s46, s46, 0xff
; SI-NEXT: s_or_b32 s20, s24, s20
; SI-NEXT: v_readlane_b32 s24, v43, 3
-; SI-NEXT: s_lshl_b32 s46, s46, 16
-; SI-NEXT: s_addk_i32 s56, 0x300
; SI-NEXT: s_add_i32 s24, s24, 3
; SI-NEXT: v_readlane_b32 s25, v43, 2
; SI-NEXT: v_readlane_b32 s26, v43, 1
-; SI-NEXT: s_or_b32 s46, s47, s46
-; SI-NEXT: s_and_b32 s47, s56, 0xffff
-; SI-NEXT: s_add_i32 s7, s7, 0x3000000
-; SI-NEXT: s_add_i32 s9, s9, 0x3000000
; SI-NEXT: s_and_b32 s24, s24, 0xff
; SI-NEXT: s_lshl_b32 s25, s25, 8
; SI-NEXT: s_add_i32 s26, s26, 3
-; SI-NEXT: s_or_b32 s56, s46, s47
-; SI-NEXT: s_add_i32 s47, s58, 0x3000000
-; SI-NEXT: s_add_i32 s58, s59, 0x3000000
-; SI-NEXT: s_add_i32 s10, s10, 0x3000000
+; SI-NEXT: s_and_b32 s46, s46, 0xff
; SI-NEXT: s_or_b32 s24, s25, s24
; SI-NEXT: v_readlane_b32 s25, v43, 0
; SI-NEXT: s_and_b32 s26, s26, 0xff
-; SI-NEXT: s_and_b32 s73, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s9, 16
-; SI-NEXT: s_and_b32 s9, s7, 0xffff0000
-; SI-NEXT: s_add_i32 s6, s6, 0x3000000
+; SI-NEXT: s_lshl_b32 s46, s46, 16
+; SI-NEXT: s_addk_i32 s56, 0x300
; SI-NEXT: s_addk_i32 s24, 0x300
; SI-NEXT: s_lshl_b32 s25, s25, 24
; SI-NEXT: s_lshl_b32 s26, s26, 16
-; SI-NEXT: s_and_b32 s63, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s79, s17, 16
-; SI-NEXT: v_writelane_b32 v42, s9, 50
-; SI-NEXT: s_lshl_b32 s17, s7, 16
-; SI-NEXT: s_lshl_b32 s7, s10, 16
-; SI-NEXT: s_add_i32 s8, s8, 0x3000000
+; SI-NEXT: s_or_b32 s46, s47, s46
+; SI-NEXT: s_and_b32 s47, s56, 0xffff
; SI-NEXT: s_and_b32 s24, s24, 0xffff
; SI-NEXT: s_or_b32 s25, s25, s26
-; SI-NEXT: v_writelane_b32 v42, s7, 51
-; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
+; SI-NEXT: s_or_b32 s56, s46, s47
; SI-NEXT: s_or_b32 s24, s25, s24
-; SI-NEXT: v_writelane_b32 v42, s7, 52
-; SI-NEXT: s_and_b32 s7, s8, 0xffff0000
; SI-NEXT: s_add_i32 s4, s4, 0x3000000
; SI-NEXT: s_add_i32 s5, s5, 0x3000000
; SI-NEXT: s_add_i32 s46, s60, 0x3000000
+; SI-NEXT: s_add_i32 s47, s58, 0x3000000
; SI-NEXT: s_add_i32 s56, s56, 0x3000000
; SI-NEXT: s_add_i32 s57, s57, 0x3000000
+; SI-NEXT: s_add_i32 s58, s59, 0x3000000
+; SI-NEXT: s_add_i32 s8, s8, 0x3000000
+; SI-NEXT: s_add_i32 s6, s6, 0x3000000
+; SI-NEXT: s_add_i32 s10, s10, 0x3000000
+; SI-NEXT: s_add_i32 s7, s7, 0x3000000
+; SI-NEXT: s_add_i32 s9, s9, 0x3000000
; SI-NEXT: s_add_i32 s11, s11, 0x3000000
; SI-NEXT: s_add_i32 s12, s12, 0x3000000
; SI-NEXT: s_add_i32 s13, s13, 0x3000000
@@ -150743,291 +151741,323 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_add_i32 s19, s19, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 0x3000000
; SI-NEXT: s_add_i32 s24, s24, 0x3000000
-; SI-NEXT: v_writelane_b32 v42, s7, 53
-; SI-NEXT: s_lshl_b32 s7, s8, 16
; SI-NEXT: s_and_b32 s27, s24, 0xffff0000
; SI-NEXT: s_lshl_b32 s26, s24, 16
-; SI-NEXT: s_and_b32 s24, s20, 0xffff0000
+; SI-NEXT: s_and_b32 s29, s20, 0xffff0000
; SI-NEXT: s_lshl_b32 s20, s20, 16
-; SI-NEXT: s_and_b32 s35, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s29, s23, 16
-; SI-NEXT: s_and_b32 s90, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s22, 16
-; SI-NEXT: s_and_b32 s25, s21, 0xffff0000
+; SI-NEXT: s_and_b32 s25, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s23, s23, 16
+; SI-NEXT: s_and_b32 s30, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s62, s22, 16
+; SI-NEXT: s_and_b32 s24, s21, 0xffff0000
; SI-NEXT: s_lshl_b32 s21, s21, 16
-; SI-NEXT: s_and_b32 s75, s19, 0xffff0000
+; SI-NEXT: s_and_b32 s78, s19, 0xffff0000
; SI-NEXT: s_lshl_b32 s22, s19, 16
-; SI-NEXT: s_and_b32 s61, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s18, 16
+; SI-NEXT: s_and_b32 s73, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s79, s18, 16
; SI-NEXT: s_and_b32 s77, s16, 0xffff0000
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_and_b32 s89, s13, 0xffff0000
+; SI-NEXT: s_and_b32 s89, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s88, s17, 16
+; SI-NEXT: s_and_b32 s59, s13, 0xffff0000
; SI-NEXT: s_lshl_b32 s19, s13, 16
-; SI-NEXT: s_and_b32 s13, s12, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s12, 16
-; SI-NEXT: s_and_b32 s60, s11, 0xffff0000
+; SI-NEXT: s_and_b32 s48, s12, 0xffff0000
+; SI-NEXT: s_lshl_b32 s61, s12, 16
+; SI-NEXT: s_and_b32 s13, s11, 0xffff0000
; SI-NEXT: s_lshl_b32 s18, s11, 16
-; SI-NEXT: s_and_b32 s23, s10, 0xffff0000
+; SI-NEXT: s_and_b32 s74, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s60, s9, 16
+; SI-NEXT: s_and_b32 s76, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s17, s7, 16
+; SI-NEXT: s_and_b32 s53, s10, 0xffff0000
+; SI-NEXT: s_lshl_b32 s52, s10, 16
+; SI-NEXT: s_and_b32 s54, s6, 0xffff0000
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: v_writelane_b32 v42, s7, 54
-; SI-NEXT: s_and_b32 s72, s58, 0xffff0000
+; SI-NEXT: s_and_b32 s90, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s8, 16
+; SI-NEXT: s_and_b32 s92, s58, 0xffff0000
; SI-NEXT: s_lshl_b32 s99, s58, 16
-; SI-NEXT: s_and_b32 s7, s57, 0xffff0000
+; SI-NEXT: s_and_b32 s58, s57, 0xffff0000
; SI-NEXT: s_lshl_b32 s57, s57, 16
-; SI-NEXT: s_and_b32 s49, s56, 0xffff0000
+; SI-NEXT: s_and_b32 s63, s56, 0xffff0000
; SI-NEXT: s_lshl_b32 s8, s56, 16
; SI-NEXT: s_and_b32 s51, s47, 0xffff0000
-; SI-NEXT: s_lshl_b32 s50, s47, 16
-; SI-NEXT: s_and_b32 s52, s46, 0xffff0000
+; SI-NEXT: s_lshl_b32 s56, s47, 16
+; SI-NEXT: s_and_b32 s55, s46, 0xffff0000
; SI-NEXT: s_lshl_b32 s97, s46, 16
-; SI-NEXT: s_and_b32 s54, s45, 0xffff0000
-; SI-NEXT: s_lshl_b32 s53, s45, 16
-; SI-NEXT: s_and_b32 s55, s44, 0xffff0000
+; SI-NEXT: s_and_b32 s64, s45, 0xffff0000
+; SI-NEXT: s_lshl_b32 s46, s45, 16
+; SI-NEXT: s_and_b32 s65, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s28, s44, 16
-; SI-NEXT: s_and_b32 s65, s43, 0xffff0000
-; SI-NEXT: s_lshl_b32 s64, s43, 16
-; SI-NEXT: s_and_b32 s66, s42, 0xffff0000
+; SI-NEXT: s_and_b32 s66, s43, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s43, 16
+; SI-NEXT: s_and_b32 s67, s42, 0xffff0000
; SI-NEXT: s_lshl_b32 s87, s42, 16
-; SI-NEXT: s_and_b32 s68, s41, 0xffff0000
-; SI-NEXT: s_lshl_b32 s67, s41, 16
-; SI-NEXT: s_and_b32 s69, s40, 0xffff0000
+; SI-NEXT: s_and_b32 s69, s41, 0xffff0000
+; SI-NEXT: s_lshl_b32 s68, s41, 16
+; SI-NEXT: s_and_b32 s70, s40, 0xffff0000
; SI-NEXT: s_lshl_b32 s86, s40, 16
-; SI-NEXT: s_and_b32 s62, s15, 0xffff0000
-; SI-NEXT: s_lshl_b32 s70, s15, 16
+; SI-NEXT: s_and_b32 s93, s15, 0xffff0000
+; SI-NEXT: s_lshl_b32 s40, s15, 16
; SI-NEXT: s_and_b32 s80, s14, 0xffff0000
; SI-NEXT: s_lshl_b32 s85, s14, 16
-; SI-NEXT: s_and_b32 s92, s5, 0xffff0000
+; SI-NEXT: s_and_b32 s45, s5, 0xffff0000
; SI-NEXT: s_lshl_b32 s11, s5, 16
; SI-NEXT: s_and_b32 s83, s4, 0xffff0000
; SI-NEXT: s_lshl_b32 s84, s4, 16
-; SI-NEXT: v_writelane_b32 v42, s7, 55
; SI-NEXT: .LBB89_3: ; %end
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s27
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s73
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s77
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s13
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 50
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23
-; SI-NEXT: v_readlane_b32 s4, v42, 51
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 52
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 53
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT: v_readlane_b32 s4, v42, 54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT: v_readlane_b32 s4, v42, 55
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s51
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s55
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s65
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s69
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s93
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s80
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s45
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s83
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
@@ -151075,103 +152105,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB89_4:
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: s_mov_b32 s7, s6
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: v_readlane_b32 s58, v43, 19
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: s_mov_b32 s95, s47
-; SI-NEXT: s_mov_b32 s94, s21
-; SI-NEXT: s_mov_b32 s93, s61
-; SI-NEXT: s_mov_b32 s34, s73
-; SI-NEXT: s_mov_b32 s91, s75
-; SI-NEXT: v_readlane_b32 s56, v43, 10
-; SI-NEXT: s_mov_b32 s36, s63
-; SI-NEXT: s_mov_b32 s38, s59
-; SI-NEXT: s_mov_b32 s37, s42
-; SI-NEXT: v_readlane_b32 s30, v43, 17
+; SI-NEXT: s_mov_b32 s91, s25
+; SI-NEXT: s_mov_b32 s34, s57
+; SI-NEXT: s_mov_b32 s37, s75
+; SI-NEXT: s_mov_b32 s31, s28
+; SI-NEXT: s_mov_b32 s35, s56
+; SI-NEXT: v_readlane_b32 s72, v43, 10
+; SI-NEXT: s_mov_b32 s38, s62
+; SI-NEXT: s_mov_b32 s94, s74
+; SI-NEXT: s_mov_b32 s50, s60
+; SI-NEXT: s_mov_b32 s95, s42
; SI-NEXT: v_readlane_b32 s98, v43, 6
-; SI-NEXT: s_mov_b32 s46, s45
-; SI-NEXT: s_mov_b32 s31, s43
-; SI-NEXT: s_mov_b32 s78, s40
+; SI-NEXT: s_mov_b32 s47, s76
+; SI-NEXT: s_mov_b32 s36, s43
+; SI-NEXT: s_mov_b32 s42, s40
; SI-NEXT: v_readlane_b32 s15, v43, 14
-; SI-NEXT: s_mov_b32 s39, s57
-; SI-NEXT: s_mov_b32 s48, s13
+; SI-NEXT: s_mov_b32 s39, s59
; SI-NEXT: v_readlane_b32 s41, v43, 13
; SI-NEXT: v_readlane_b32 s44, v43, 5
; SI-NEXT: v_readlane_b32 s9, v43, 11
; SI-NEXT: v_readlane_b32 s14, v43, 12
; SI-NEXT: v_readlane_b32 s81, v43, 9
; SI-NEXT: v_readlane_b32 s10, v43, 16
+; SI-NEXT: s_mov_b32 s7, s6
; SI-NEXT: v_readlane_b32 s12, v43, 4
; SI-NEXT: v_readlane_b32 s96, v43, 7
; SI-NEXT: v_readlane_b32 s82, v43, 8
; SI-NEXT: v_readlane_b32 s71, v43, 15
-; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; implicit-def: $sgpr27
; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr24
; SI-NEXT: ; implicit-def: $sgpr29
-; SI-NEXT: ; implicit-def: $sgpr35
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr23
; SI-NEXT: ; implicit-def: $sgpr25
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr24
; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr79
+; SI-NEXT: ; implicit-def: $sgpr73
; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr19
-; SI-NEXT: ; implicit-def: $sgpr89
; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: ; implicit-def: $sgpr13
+; SI-NEXT: ; implicit-def: $sgpr89
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr59
+; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr53
; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr49
+; SI-NEXT: ; implicit-def: $sgpr90
; SI-NEXT: ; implicit-def: $sgpr99
-; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr92
; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; kill: killed $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr63
+; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr97
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr55
+; SI-NEXT: ; implicit-def: $sgpr46
; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr87
+; SI-NEXT: ; implicit-def: $sgpr43
; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr87
; SI-NEXT: ; implicit-def: $sgpr67
; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr93
; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: ; implicit-def: $sgpr11
-; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr45
; SI-NEXT: ; implicit-def: $sgpr84
; SI-NEXT: ; implicit-def: $sgpr83
; SI-NEXT: s_branch .LBB89_2
@@ -154937,1990 +155959,2106 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v24
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v58
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v59
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v63
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; kill: killed $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v57
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v60
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v61
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v37
+; SI-NEXT: ; kill: killed $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; kill: killed $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v4
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v5
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; kill: killed $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v6
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10
-; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v8
; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; kill: killed $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB90_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v31, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v32, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v29, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v30, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v27, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v28, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v25, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v26, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v23, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v24, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v21, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v22, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v19, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v20, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v17, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v18, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v15, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v16, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v13, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v14, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v11, v2, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v12, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v9, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62
+; SI-NEXT: v_or_b32_e32 v10, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
+; SI-NEXT: v_or_b32_e32 v7, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v8, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v45
+; SI-NEXT: v_or_b32_e32 v5, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57
+; SI-NEXT: v_or_b32_e32 v3, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47
+; SI-NEXT: v_or_b32_e32 v4, v2, v4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61
+; SI-NEXT: v_or_b32_e32 v35, v2, v33
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v42
+; SI-NEXT: v_or_b32_e32 v2, v2, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v37
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v37
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v39
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v39
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46
-; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v63
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v63
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
-; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63
-; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v56
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v62
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v52
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v59
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16
-; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16
-; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v6
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v43
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; kill: killed $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; kill: killed $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; kill: killed $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; kill: killed $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: .LBB90_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB90_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v35, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v46
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v59
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v53
+; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v53
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51
+; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v49
+; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v54
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v34
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v50
-; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v52
-; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v54
-; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v40
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v30
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v40
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v27, v1, v27
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v28, v1, v28
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v29, v1, v29
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v30, v1, v30
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v31, v1, v31
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v32, v1, v32
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
-; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v37
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v37, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v52
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v2, v35, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20
-; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v40
+; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v55
+; SI-NEXT: v_mov_b32_e32 v39, v41
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: .LBB90_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v38
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31
+; SI-NEXT: v_or_b32_e32 v1, v1, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v31
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v33, v31
+; SI-NEXT: v_or_b32_e32 v1, v1, v31
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v40
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31
+; SI-NEXT: v_or_b32_e32 v1, v1, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v31
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v32, v31
+; SI-NEXT: v_or_b32_e32 v1, v1, v31
+; SI-NEXT: v_add_i32_e32 v31, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v35
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; SI-NEXT: v_or_b32_e32 v1, v1, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v29
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v31, v29
+; SI-NEXT: v_or_b32_e32 v1, v1, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v54
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; SI-NEXT: v_or_b32_e32 v1, v1, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v29
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v30, v29
+; SI-NEXT: v_or_b32_e32 v1, v1, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v29
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v27, 0xff, v27
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v29, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v27
+; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v52
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v27, 0xff, v27
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v28, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v27
+; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v29
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v27, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v25
+; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v50
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v26, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v25
+; SI-NEXT: v_add_i32_e32 v25, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v26
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT: v_or_b32_e32 v1, v1, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v25, v23
+; SI-NEXT: v_or_b32_e32 v1, v1, v23
+; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v48
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT: v_or_b32_e32 v1, v1, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: v_or_b32_e32 v1, v1, v23
+; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v23
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; SI-NEXT: v_or_b32_e32 v1, v1, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v21
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v23, v21
+; SI-NEXT: v_or_b32_e32 v1, v1, v21
+; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v37
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; SI-NEXT: v_or_b32_e32 v1, v1, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v21
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: v_or_b32_e32 v1, v1, v21
+; SI-NEXT: v_add_i32_e32 v21, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v18
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; SI-NEXT: v_or_b32_e32 v1, v1, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
+; SI-NEXT: v_or_b32_e32 v1, v1, v19
+; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v34
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; SI-NEXT: v_or_b32_e32 v1, v1, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v20, v19
+; SI-NEXT: v_or_b32_e32 v1, v1, v19
+; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v15
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v19, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v31
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v18, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v13
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT: v_or_b32_e32 v1, v1, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v17, v15
+; SI-NEXT: v_or_b32_e32 v1, v1, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v28
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT: v_or_b32_e32 v1, v1, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v12, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: v_or_b32_e32 v1, v1, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT: v_or_b32_e32 v1, v1, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v9, v11, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v15, v13
+; SI-NEXT: v_or_b32_e32 v1, v1, v13
+; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v25
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT: v_or_b32_e32 v1, v1, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v39
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v13, v14, v13
+; SI-NEXT: v_or_b32_e32 v1, v1, v13
+; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v1, v1, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v13, v11
+; SI-NEXT: v_or_b32_e32 v1, v1, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v1, v1, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v37
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: v_or_b32_e32 v1, v1, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_or_b32_e32 v1, v1, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v11, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v1, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; SI-NEXT: v_or_b32_e32 v6, v6, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT: v_or_b32_e32 v1, v1, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v63
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_or_b32_e32 v6, v6, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v9, v10, v9
+; SI-NEXT: v_or_b32_e32 v1, v1, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v22
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; SI-NEXT: v_or_b32_e32 v6, v6, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_or_b32_e32 v6, v6, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v8, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v9, v7
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v20
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v8, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v8, v7
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0
+; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v1, v1, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: v_or_b32_e32 v1, v1, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0
+; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v1, v1, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v1, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0
+; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v35
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v7
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -156968,1187 +158106,1285 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr36
; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
-; VI-NEXT: ; kill: killed $vgpr33
-; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB90_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v11
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v10
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v7
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v58
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v58
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v57
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v57
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v23
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v26
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v22
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v19
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v21
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v20
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v26
+; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v54, v50
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[57:58]
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[27:28]
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[25:26]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v24
+; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[5:6]
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v40, v46
+; VI-NEXT: v_mov_b32_e32 v46, v45
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
-; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, v46
-; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30]
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v46, v63
-; VI-NEXT: v_mov_b32_e32 v63, v50
-; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28]
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8
-; VI-NEXT: v_mov_b32_e32 v51, v57
-; VI-NEXT: v_mov_b32_e32 v50, v56
-; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
-; VI-NEXT: v_mov_b32_e32 v57, v43
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
-; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT: v_mov_b32_e32 v47, v34
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[17:18]
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v58
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v22
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v20
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v20
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v16
; VI-NEXT: .LBB90_2: ; %Flow
-; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: s_xor_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB90_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v18
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v18, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v18, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v18
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v18
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v18
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v18, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v18, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v18, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v17
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v17, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v17, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v20
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v20, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v20
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v20
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v20, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v20
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v20
; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; VI-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v20, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v20, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v20, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v19
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v19
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v19
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v19
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v19
; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; VI-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v19, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v19, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v19, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v22
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v22, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v22
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v22
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v22, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v22
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v22
; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; VI-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v22, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v22, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v21
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v21
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v21
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v21, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v21
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v21
; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v21, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v21, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v24
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v24, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v24
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v24
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v24, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v24
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v24
; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; VI-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v24, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v24, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v23
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v23
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v23
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v23, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v23
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v23
; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v23, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v23, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v26
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v26, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v26
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v26
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v26, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v26
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v26
; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; VI-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v26, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v26, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v25
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v25
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v25
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v25
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v25
; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v25, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v25, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v28
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v28, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v28
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v28
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v28, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v28
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v28
; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; VI-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v28, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v34, v28, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v27
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v27
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v27
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v27
; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v27, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v33, v27, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v30, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v30
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v30
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30
; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; VI-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v30, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v29
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v29, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v29
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29
; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v29, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; VI-NEXT: v_or_b32_e32 v32, v30, v33
+; VI-NEXT: v_or_b32_e32 v31, v29, v31
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v58
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v58
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v57
; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v32
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_alignbit_b32 v32, v32, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v57
+; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT: v_cndmask_b32_e32 v36, v33, v34, vcc
+; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32
+; VI-NEXT: v_or_b32_e32 v49, v58, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; VI-NEXT: v_or_b32_e32 v48, v57, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v31, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v2
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v2
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v2, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v2
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v2, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v1
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v1
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v51, v2, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; VI-NEXT: v_or_b32_e32 v50, v1, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v4
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v4
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v4, 16, 1
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v4
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v4, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v3
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v3
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v3
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v3, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v3
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v53, v4, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; VI-NEXT: v_or_b32_e32 v52, v3, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v6
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v6
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v38, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v6
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v6, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v5
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v5
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v5
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v5, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v5
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v38
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v43, v6, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; VI-NEXT: v_or_b32_e32 v42, v5, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v8
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v8
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v8, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v41, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v8
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v8, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v7
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v7
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v7, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v41
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v40, v8, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; VI-NEXT: v_or_b32_e32 v39, v7, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v10
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v10, 16, 1
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v10
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v10, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v9
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v9
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v9
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v62, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v9
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v55, v10, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62
+; VI-NEXT: v_or_b32_e32 v54, v9, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v12
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v12
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v12, 16, 1
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v12
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v12, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v11
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v11
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v11, 16, 1
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_cndmask_b32_e32 v36, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v11
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v45, v12, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36
+; VI-NEXT: v_or_b32_e32 v44, v11, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v13
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v13
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v13
+; VI-NEXT: v_or_b32_e32 v46, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_bfe_u32 v31, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v63, v32, v46, vcc
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v13
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_or_b32_e32 v61, v14, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63
+; VI-NEXT: v_or_b32_e32 v60, v13, v31
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v16
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_or_b32_e32 v46, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v16
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v16
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v46, vcc
+; VI-NEXT: v_bfe_u32 v32, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v16
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v46, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v33, 16
-; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_cndmask_b32_e32 v16, v32, v46, vcc
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT: v_bfe_u32 v46, v32, 16, 1
+; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v32
+; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_or_b32_e32 v47, 0x400000, v32
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v15
-; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v15
+; VI-NEXT: v_cndmask_b32_e32 v32, v46, v47, vcc
+; VI-NEXT: v_bfe_u32 v46, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v15
+; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46
+; VI-NEXT: v_or_b32_e32 v47, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v34, v35, vcc
+; VI-NEXT: v_cndmask_b32_e32 v15, v46, v47, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_and_b32_e32 v46, 0xffff0000, v31
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v33, 16
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18
-; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
-; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: .LBB90_4: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_e32 v47, v16, v46
+; VI-NEXT: v_and_b32_e32 v46, 0xffff0000, v32
+; VI-NEXT: v_or_b32_e32 v46, v15, v46
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v47
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v46
+; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[46:47]
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v60
+; VI-NEXT: v_lshrrev_b64 v[59:60], 24, v[60:61]
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v45
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v44
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[44:45]
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v55
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v54
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[54:55]
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v40
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v39
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v43
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v42
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v53
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v52
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53]
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v50
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[50:51]
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v49
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v48
+; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[48:49]
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[42:43]
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v31
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v61
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v32
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v63
+; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v33
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v33
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[39:40]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50]
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50]
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50]
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v50
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[49:50]
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[50:51]
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v34
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v36
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v35
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v35
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v62
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v41
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v41
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v38
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v38
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v50
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v50
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v50
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44
-; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v32
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43
-; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43
+; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v32
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v31
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; VI-NEXT: .LBB90_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v50
+; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v50
+; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v53
+; VI-NEXT: v_or_b32_sdwa v33, v33, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -158159,242 +159395,231 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
-; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
+; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38
+; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -158402,36 +159627,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -158442,39 +159664,57 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63
-; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -162540,484 +163780,613 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v30
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v15, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v21, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v23, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s29
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50
; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v51
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v55
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v40
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43
-; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v43
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB91_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mov_b32_e32 v43, v36
-; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16
-; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v43, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v32, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v17, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_mov_b32_e32 v11, v30
+; SI-NEXT: v_or_b32_e32 v30, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v15, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16
-; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24
-; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39
-; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8
-; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16
-; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8
-; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_mov_b32_e32 v19, v63
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
+; SI-NEXT: v_mov_b32_e32 v23, v26
+; SI-NEXT: v_or_b32_e32 v26, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v45, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v7, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v22, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v56, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; SI-NEXT: v_mov_b32_e32 v47, v18
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v20, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v13, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
+; SI-NEXT: v_or_b32_e32 v18, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_or_b32_e32 v9, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_mov_b32_e32 v60, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46
+; SI-NEXT: v_or_b32_e32 v16, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57
+; SI-NEXT: v_or_b32_e32 v57, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v14, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v62, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
+; SI-NEXT: v_or_b32_e32 v12, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v44, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v10, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v59, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v8, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46
-; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
+; SI-NEXT: v_mov_b32_e32 v49, v5
+; SI-NEXT: v_or_b32_e32 v5, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; SI-NEXT: v_or_b32_e32 v6, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v3, v1, v2
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v53, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v27, v21, v15, 24
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_mov_b32_e32 v58, v33
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v42
+; SI-NEXT: v_mov_b32_e32 v48, v23
+; SI-NEXT: v_alignbit_b32 v23, v6, v5, 16
+; SI-NEXT: v_mov_b32_e32 v38, v19
+; SI-NEXT: v_alignbit_b32 v24, v32, v43, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v32, v43, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 24
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v21, v15, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v24, v21, v15, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v24, v39
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v14
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v10
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v8
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v6
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v50, v42
+; SI-NEXT: v_alignbit_b32 v37, v8, v59, 24
+; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v16
+; SI-NEXT: v_alignbit_b32 v36, v20, v56, 16
+; SI-NEXT: v_alignbit_b32 v61, v6, v5, 8
+; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v45
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_alignbit_b32 v27, v26, v11, 24
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v27, v26, v11, 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v27, v26, v11, 8
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT: v_or_b32_e32 v2, v2, v33
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
-; SI-NEXT: v_mov_b32_e32 v17, v63
-; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31
-; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v20, v56, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v20, v56, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16
-; SI-NEXT: v_mov_b32_e32 v45, v8
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8
-; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24
-; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59
-; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23
-; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 24
+; SI-NEXT: v_alignbit_b32 v29, v2, v1, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
-; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 16
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v22
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 8
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v20
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34
-; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v8, v59, 16
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v18
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v8, v59, 8
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28
-; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v52
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v54
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8
-; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v52, v33
+; SI-NEXT: v_mov_b32_e32 v33, v23
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v54
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v47
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v38
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42
-; SI-NEXT: v_mov_b32_e32 v15, v9
-; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v60
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v60
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v49
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_alignbit_b32 v27, v45, v11, 24
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v31, v45, v11, 16
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v27, v45, v11, 8
+; SI-NEXT: v_mov_b32_e32 v23, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v8, v37
-; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v41
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46
-; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v41
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v63
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v63
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v46
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v46
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v48
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59
-; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v25
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v27, v4, v3, 24
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42
-; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v32
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v58
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v20, v29
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v30
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v4
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9
-; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16
-; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v58
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58
-; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v21
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v2
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55
-; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24
+; SI-NEXT: v_alignbit_b32 v35, v4, v3, 8
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v29, v28
-; SI-NEXT: v_mov_b32_e32 v23, v48
+; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v26
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v49
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v24
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63
-; SI-NEXT: v_mov_b32_e32 v48, v33
-; SI-NEXT: v_mov_b32_e32 v34, v53
-; SI-NEXT: v_mov_b32_e32 v53, v42
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v39, v24
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28
+; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v50
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v50
+; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16
+; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16
+; SI-NEXT: v_alignbit_b32 v40, v2, v1, 8
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v28
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v53
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v38, v49
+; SI-NEXT: v_mov_b32_e32 v49, v37
+; SI-NEXT: v_mov_b32_e32 v58, v35
+; SI-NEXT: v_alignbit_b32 v35, v32, v43, 24
; SI-NEXT: s_branch .LBB91_3
; SI-NEXT: .LBB91_2:
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; kill: killed $vgpr37
+; SI-NEXT: ; kill: killed $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -163155,1143 +164524,1123 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: v_mov_b32_e32 v53, v42
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mov_b32_e32 v48, v33
-; SI-NEXT: v_mov_b32_e32 v29, v28
-; SI-NEXT: v_mov_b32_e32 v37, v34
-; SI-NEXT: v_mov_b32_e32 v17, v63
-; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; kill: killed $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; kill: killed $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; kill: killed $vgpr56
; SI-NEXT: .LBB91_3: ; %Flow
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v56, v17
-; SI-NEXT: v_mov_b32_e32 v54, v61
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v42, v32
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v34, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccnz .LBB91_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v36
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v49
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v29
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v42
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v39
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v23
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v53
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v53
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v58, v4, v3, 8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26
-; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11
-; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v61, v6, v5, 8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31
-; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v59, v7, v8
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v44, v9, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32
-; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v62, v11, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v50
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v31
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v40
; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v11
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v27
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v57, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v9, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v13, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v56, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v42
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v7, v21, v22
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_or_b32_e32 v45, v24, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v45
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v40
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v52
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v55
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40
-; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41
-; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43
-; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36
-; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v11, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v54
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46
-; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v53
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42
-; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56
-; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v15, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39
-; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50
-; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: v_or_b32_e32 v21, v28, v29
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v21, v15, 24
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42
-; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v26, v11, 24
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v26, v11, 16
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43
-; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v26, v11, 8
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41
-; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v45, v23, 24
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40
-; SI-NEXT: v_mov_b32_e32 v40, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54
-; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v25, v45, v23, 8
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8
+; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v18
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23
-; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v14
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v17, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v30, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v43, v31, v32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_or_b32_e32 v32, v32, v41
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49
-; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v55
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v54
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26
-; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16
-; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v51
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v50
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v49
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v48
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v38
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v34
+; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v34
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v33
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v22, v7, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v20, v56, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v20, v56, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v18, v13, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 24
+; SI-NEXT: v_alignbit_b32 v24, v32, v43, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v16, v9, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 24
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v30, v17, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v21, v15, 16
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v14, v57, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v21, v15, 8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 24
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v37
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 16
+; SI-NEXT: v_alignbit_b32 v24, v4, v3, 24
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v12, v62, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v24, v2, v1, 24
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 24
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v32
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v30
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v10
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v10, v44, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v21
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v8
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v8, v59, 16
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v26
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v51
+; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v49
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v8, v59, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v22
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v50, 24, v50
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v48
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v35, v32, v43, 24
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_alignbit_b32 v41, v32, v43, 8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v34, v45, v23, 16
+; SI-NEXT: v_alignbit_b32 v36, v20, v56, 16
+; SI-NEXT: v_alignbit_b32 v49, v8, v59, 24
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v52, v6, v5, 24
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16
+; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16
+; SI-NEXT: v_alignbit_b32 v40, v2, v1, 8
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v20
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27
+; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v2
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: .LBB91_5: ; %end
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v43
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v41
+; SI-NEXT: v_or_b32_e32 v31, v31, v46
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v35
+; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v17
+; SI-NEXT: v_and_b32_e32 v27, 0xff, v15
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v11
+; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28
-; SI-NEXT: v_or_b32_e32 v32, v36, v32
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v29
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29
-; SI-NEXT: v_or_b32_e32 v36, v56, v36
-; SI-NEXT: v_or_b32_e32 v32, v32, v36
-; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v48, 0xff, v24
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; SI-NEXT: v_or_b32_e32 v37, v37, v48
+; SI-NEXT: v_or_b32_e32 v31, v31, v37
+; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xff, v44
-; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51
-; SI-NEXT: v_or_b32_e32 v32, v32, v36
-; SI-NEXT: v_and_b32_e32 v36, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT: v_or_b32_e32 v14, v14, v36
-; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; SI-NEXT: v_or_b32_e32 v14, v32, v14
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v24
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v32, 0xff, v32
+; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v24
+; SI-NEXT: v_or_b32_e32 v32, v35, v32
+; SI-NEXT: v_or_b32_e32 v31, v31, v32
; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v19
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v56
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v29, v29, v31
+; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xff, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v17
+; SI-NEXT: v_or_b32_e32 v31, v32, v31
+; SI-NEXT: v_or_b32_e32 v29, v29, v31
+; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v14, v14, v32
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v17
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v30, 0xff, v30
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xff, v19
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v17
+; SI-NEXT: v_or_b32_e32 v30, v31, v30
+; SI-NEXT: v_or_b32_e32 v29, v29, v30
+; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v57
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v27, v27, v29
+; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19
-; SI-NEXT: v_or_b32_e32 v32, v33, v32
-; SI-NEXT: v_or_b32_e32 v14, v14, v32
-; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v58
-; SI-NEXT: v_or_b32_e32 v11, v14, v11
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v29, 0xff, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6
-; SI-NEXT: v_or_b32_e32 v14, v32, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v15
+; SI-NEXT: v_or_b32_e32 v29, v30, v29
+; SI-NEXT: v_or_b32_e32 v27, v27, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v60
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v27, 0xff, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v15
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v28, 0xff, v28
+; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v15
+; SI-NEXT: v_or_b32_e32 v28, v29, v28
+; SI-NEXT: v_or_b32_e32 v27, v27, v28
+; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v44
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v25, v25, v27
+; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xff, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v11
+; SI-NEXT: v_or_b32_e32 v27, v28, v27
+; SI-NEXT: v_or_b32_e32 v25, v25, v27
+; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v26
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v11
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19
+; SI-NEXT: v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v28, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v11
+; SI-NEXT: v_or_b32_e32 v26, v27, v26
+; SI-NEXT: v_or_b32_e32 v25, v25, v26
+; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v23, v23, v25
+; SI-NEXT: v_and_b32_e32 v25, 0xff, v34
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v11
+; SI-NEXT: v_or_b32_e32 v25, v26, v25
+; SI-NEXT: v_or_b32_e32 v23, v23, v25
+; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v47
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v25
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v23, 0xff, v45
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2
-; SI-NEXT: v_or_b32_e32 v14, v28, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v11
+; SI-NEXT: v_or_b32_e32 v24, v25, v24
+; SI-NEXT: v_or_b32_e32 v23, v23, v24
+; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v57
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT: v_or_b32_e32 v21, v21, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19
+; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v25, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v23, 0xff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: v_or_b32_e32 v21, v21, v23
+; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v50
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2
-; SI-NEXT: v_or_b32_e32 v14, v25, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: v_and_b32_e32 v22, 0xff, v60
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11
+; SI-NEXT: v_or_b32_e32 v22, v23, v22
+; SI-NEXT: v_or_b32_e32 v21, v21, v22
+; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; SI-NEXT: v_or_b32_e32 v19, v19, v21
+; SI-NEXT: v_and_b32_e32 v21, 0xff, v36
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v22, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT: v_or_b32_e32 v19, v19, v21
+; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v62
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v20
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v11
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v22, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: v_or_b32_e32 v19, v19, v20
+; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v34
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; SI-NEXT: v_or_b32_e32 v17, v17, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17
+; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v19, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v20, v19
+; SI-NEXT: v_or_b32_e32 v17, v17, v19
+; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v55
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v61
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v18
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v59
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17
-; SI-NEXT: v_or_b32_e32 v14, v19, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v19, v18
+; SI-NEXT: v_or_b32_e32 v17, v17, v18
+; SI-NEXT: v_add_i32_e32 v18, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54
-; SI-NEXT: v_or_b32_e32 v14, v16, v14
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v15, v15, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v18, v17
+; SI-NEXT: v_or_b32_e32 v15, v15, v17
+; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v38
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v11
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v17, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v14
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v11
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v62
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_or_b32_e32 v14, v16, v14
-; SI-NEXT: v_or_b32_e32 v11, v11, v14
-; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v14, v15, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
; SI-NEXT: v_or_b32_e32 v13, v14, v13
; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0
+; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v35
-; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xff, v45
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v11, v13, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v13, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v30
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_or_b32_e32 v11, v13, v11
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT: v_or_b32_e32 v7, v7, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_or_b32_e32 v7, v7, v10
-; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v49
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v10, v9
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v27
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v63
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT: v_or_b32_e32 v8, v10, v8
+; SI-NEXT: v_or_b32_e32 v8, v9, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
+; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v26
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v61
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v33
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v52
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0
+; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v24
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v47
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_or_b32_e32 v6, v6, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v21
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v58
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v42
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v52
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v18
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v54
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: v_or_b32_e32 v4, v4, v5
-; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v55
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v63
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42
-; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v12
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v40
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v40
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v51
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -164322,8 +165671,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: v_writelane_b32 v63, s30, 0
; VI-NEXT: v_writelane_b32 v63, s31, 1
@@ -164358,24 +165707,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_writelane_b32 v63, s86, 30
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
; VI-NEXT: v_writelane_b32 v63, s87, 31
-; VI-NEXT: v_readfirstlane_b32 s44, v3
-; VI-NEXT: v_readfirstlane_b32 s45, v4
-; VI-NEXT: v_readfirstlane_b32 s42, v5
-; VI-NEXT: v_readfirstlane_b32 s43, v6
-; VI-NEXT: v_readfirstlane_b32 s40, v7
-; VI-NEXT: v_readfirstlane_b32 s41, v8
-; VI-NEXT: v_readfirstlane_b32 s14, v9
-; VI-NEXT: v_readfirstlane_b32 s15, v10
-; VI-NEXT: v_readfirstlane_b32 s12, v11
-; VI-NEXT: v_readfirstlane_b32 s13, v12
-; VI-NEXT: v_readfirstlane_b32 s10, v13
-; VI-NEXT: v_readfirstlane_b32 s11, v14
-; VI-NEXT: v_readfirstlane_b32 s8, v15
-; VI-NEXT: v_readfirstlane_b32 s9, v16
-; VI-NEXT: v_readfirstlane_b32 s6, v17
-; VI-NEXT: v_readfirstlane_b32 s7, v18
+; VI-NEXT: v_readfirstlane_b32 s76, v3
+; VI-NEXT: v_readfirstlane_b32 s77, v4
+; VI-NEXT: v_readfirstlane_b32 s74, v5
+; VI-NEXT: v_readfirstlane_b32 s75, v6
+; VI-NEXT: v_readfirstlane_b32 s72, v7
+; VI-NEXT: v_readfirstlane_b32 s73, v8
+; VI-NEXT: v_readfirstlane_b32 s62, v9
+; VI-NEXT: v_readfirstlane_b32 s63, v10
+; VI-NEXT: v_readfirstlane_b32 s60, v11
+; VI-NEXT: v_readfirstlane_b32 s61, v12
+; VI-NEXT: v_readfirstlane_b32 s58, v13
+; VI-NEXT: v_readfirstlane_b32 s59, v14
+; VI-NEXT: v_readfirstlane_b32 s56, v15
+; VI-NEXT: v_readfirstlane_b32 s57, v16
+; VI-NEXT: v_readfirstlane_b32 s44, v17
+; VI-NEXT: v_readfirstlane_b32 s45, v18
; VI-NEXT: v_readfirstlane_b32 s4, v1
-; VI-NEXT: s_and_b64 s[46:47], vcc, exec
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
@@ -164394,954 +165743,1003 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; VI-NEXT: s_cbranch_scc0 .LBB91_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s5, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 57
-; VI-NEXT: s_lshr_b32 s46, s5, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 56
-; VI-NEXT: s_lshr_b32 s46, s5, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 55
-; VI-NEXT: s_lshr_b32 s46, s4, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 54
-; VI-NEXT: s_lshr_b32 s46, s4, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 53
-; VI-NEXT: s_lshr_b32 s46, s29, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 52
-; VI-NEXT: s_lshr_b32 s46, s29, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 51
-; VI-NEXT: s_lshr_b32 s46, s29, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 50
-; VI-NEXT: s_lshr_b32 s46, s28, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 49
-; VI-NEXT: s_lshr_b32 s46, s28, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 48
-; VI-NEXT: s_lshr_b32 s46, s27, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 47
-; VI-NEXT: s_lshr_b32 s46, s27, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 46
-; VI-NEXT: s_lshr_b32 s46, s27, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 45
-; VI-NEXT: s_lshr_b32 s46, s26, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 44
-; VI-NEXT: s_lshr_b32 s46, s26, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 43
-; VI-NEXT: s_lshr_b32 s46, s25, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 42
-; VI-NEXT: s_lshr_b32 s46, s25, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 41
-; VI-NEXT: s_lshr_b32 s46, s25, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 40
-; VI-NEXT: s_lshr_b32 s46, s24, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 39
-; VI-NEXT: s_lshr_b32 s46, s24, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 38
-; VI-NEXT: s_lshr_b32 s46, s23, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 37
-; VI-NEXT: s_lshr_b32 s46, s23, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 36
-; VI-NEXT: s_lshr_b32 s46, s23, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 35
-; VI-NEXT: s_lshr_b32 s46, s22, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 34
-; VI-NEXT: s_lshr_b32 s46, s22, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 33
-; VI-NEXT: s_lshr_b32 s46, s21, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 32
-; VI-NEXT: s_lshr_b32 s46, s21, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 31
-; VI-NEXT: s_lshr_b32 s46, s21, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 30
-; VI-NEXT: s_lshr_b32 s46, s20, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 29
-; VI-NEXT: s_lshr_b32 s46, s20, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 28
-; VI-NEXT: s_lshr_b32 s46, s19, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 27
-; VI-NEXT: s_lshr_b32 s46, s19, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 26
-; VI-NEXT: s_lshr_b32 s46, s19, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 25
-; VI-NEXT: s_lshr_b32 s46, s18, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 24
-; VI-NEXT: s_lshr_b32 s46, s18, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 23
-; VI-NEXT: s_lshr_b32 s46, s17, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 22
-; VI-NEXT: s_lshr_b32 s46, s17, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 21
-; VI-NEXT: s_lshr_b32 s46, s17, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 20
-; VI-NEXT: s_lshr_b32 s46, s16, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 19
-; VI-NEXT: s_lshr_b32 s46, s16, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 18
-; VI-NEXT: s_lshr_b32 s46, s7, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 17
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 16
-; VI-NEXT: s_lshr_b32 s46, s7, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 15
-; VI-NEXT: s_lshr_b32 s46, s6, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 14
-; VI-NEXT: s_lshr_b32 s46, s6, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 13
-; VI-NEXT: s_lshr_b32 s46, s9, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 12
-; VI-NEXT: s_lshr_b32 s46, s9, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 11
-; VI-NEXT: s_lshr_b32 s46, s9, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 10
-; VI-NEXT: s_lshr_b32 s46, s8, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 9
-; VI-NEXT: s_lshr_b32 s46, s8, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 8
-; VI-NEXT: s_lshr_b32 s46, s11, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 7
-; VI-NEXT: s_lshr_b32 s46, s11, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 6
-; VI-NEXT: s_lshr_b32 s46, s11, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 5
-; VI-NEXT: s_lshr_b32 s46, s10, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 4
-; VI-NEXT: s_lshr_b32 s46, s10, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 3
-; VI-NEXT: s_lshr_b32 s46, s13, 24
-; VI-NEXT: v_writelane_b32 v62, s46, 2
-; VI-NEXT: s_lshr_b32 s46, s13, 16
-; VI-NEXT: v_writelane_b32 v62, s46, 1
-; VI-NEXT: s_lshr_b32 s46, s12, 16
-; VI-NEXT: s_lshr_b32 s80, s13, 8
-; VI-NEXT: v_writelane_b32 v62, s46, 0
-; VI-NEXT: s_lshr_b32 s81, s12, 8
-; VI-NEXT: s_lshr_b32 s82, s15, 24
-; VI-NEXT: s_lshr_b32 s83, s15, 16
-; VI-NEXT: s_lshr_b32 s85, s15, 8
-; VI-NEXT: s_lshr_b32 s84, s14, 16
-; VI-NEXT: s_lshr_b32 s86, s14, 8
-; VI-NEXT: s_lshr_b32 s87, s41, 24
-; VI-NEXT: s_lshr_b32 s50, s41, 16
-; VI-NEXT: s_lshr_b32 s52, s41, 8
-; VI-NEXT: s_lshr_b32 s51, s40, 16
-; VI-NEXT: s_lshr_b32 s53, s40, 8
-; VI-NEXT: s_lshr_b32 s54, s43, 24
-; VI-NEXT: s_lshr_b32 s55, s43, 16
-; VI-NEXT: s_lshr_b32 s65, s43, 8
-; VI-NEXT: s_lshr_b32 s64, s42, 16
-; VI-NEXT: s_lshr_b32 s66, s42, 8
-; VI-NEXT: s_lshr_b32 s67, s45, 24
-; VI-NEXT: s_lshr_b32 s68, s45, 16
-; VI-NEXT: s_lshr_b32 s70, s45, 8
-; VI-NEXT: s_lshr_b32 s69, s44, 16
-; VI-NEXT: s_lshr_b32 s71, s44, 8
-; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24
-; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24
-; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24
-; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24
-; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24
-; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24
-; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24
-; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24
-; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24
+; VI-NEXT: s_lshr_b32 s6, s5, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 19
+; VI-NEXT: s_lshr_b32 s6, s5, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 21
+; VI-NEXT: s_lshr_b32 s6, s5, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 18
+; VI-NEXT: s_lshr_b32 s6, s4, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 23
+; VI-NEXT: s_lshr_b32 s6, s4, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 20
+; VI-NEXT: s_lshr_b32 s6, s29, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 25
+; VI-NEXT: s_lshr_b32 s6, s29, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 26
+; VI-NEXT: s_lshr_b32 s6, s29, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 22
+; VI-NEXT: s_lshr_b32 s6, s28, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 28
+; VI-NEXT: s_lshr_b32 s6, s28, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 24
+; VI-NEXT: s_lshr_b32 s6, s27, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 30
+; VI-NEXT: s_lshr_b32 s6, s27, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 31
+; VI-NEXT: s_lshr_b32 s6, s27, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 27
+; VI-NEXT: s_lshr_b32 s6, s26, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 34
+; VI-NEXT: s_lshr_b32 s6, s26, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 29
+; VI-NEXT: s_lshr_b32 s6, s25, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 36
+; VI-NEXT: s_lshr_b32 s6, s25, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 37
+; VI-NEXT: s_lshr_b32 s6, s25, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 32
+; VI-NEXT: s_lshr_b32 s6, s24, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 39
+; VI-NEXT: s_lshr_b32 s6, s24, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 33
+; VI-NEXT: s_lshr_b32 s6, s23, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 41
+; VI-NEXT: s_lshr_b32 s6, s23, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 42
+; VI-NEXT: s_lshr_b32 s6, s23, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 35
+; VI-NEXT: s_lshr_b32 s6, s22, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 44
+; VI-NEXT: s_lshr_b32 s6, s22, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 38
+; VI-NEXT: s_lshr_b32 s6, s21, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 46
+; VI-NEXT: s_lshr_b32 s6, s21, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 48
+; VI-NEXT: s_lshr_b32 s6, s21, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 40
+; VI-NEXT: s_lshr_b32 s6, s20, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 50
+; VI-NEXT: s_lshr_b32 s6, s20, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 43
+; VI-NEXT: s_lshr_b32 s6, s19, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 52
+; VI-NEXT: s_lshr_b32 s6, s19, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 53
+; VI-NEXT: s_lshr_b32 s6, s19, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 45
+; VI-NEXT: s_lshr_b32 s6, s18, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 54
+; VI-NEXT: s_lshr_b32 s6, s18, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 47
+; VI-NEXT: s_lshr_b32 s6, s17, 24
+; VI-NEXT: v_writelane_b32 v62, s6, 55
+; VI-NEXT: s_lshr_b32 s6, s17, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 56
+; VI-NEXT: s_lshr_b32 s6, s17, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 49
+; VI-NEXT: s_lshr_b32 s6, s16, 16
+; VI-NEXT: v_writelane_b32 v62, s6, 57
+; VI-NEXT: s_lshr_b32 s6, s16, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 51
+; VI-NEXT: s_lshr_b32 s6, s59, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 16
+; VI-NEXT: s_lshr_b32 s6, s61, 8
+; VI-NEXT: v_writelane_b32 v62, s6, 17
+; VI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 14
+; VI-NEXT: v_writelane_b32 v62, s79, 15
+; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 12
+; VI-NEXT: v_writelane_b32 v62, s79, 13
+; VI-NEXT: s_lshr_b64 s[78:79], s[26:27], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 10
+; VI-NEXT: v_writelane_b32 v62, s79, 11
+; VI-NEXT: s_lshr_b64 s[78:79], s[24:25], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 8
+; VI-NEXT: v_writelane_b32 v62, s79, 9
+; VI-NEXT: s_lshr_b64 s[78:79], s[22:23], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 6
+; VI-NEXT: v_writelane_b32 v62, s79, 7
+; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 4
+; VI-NEXT: v_writelane_b32 v62, s79, 5
+; VI-NEXT: s_lshr_b64 s[78:79], s[18:19], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 2
+; VI-NEXT: v_writelane_b32 v62, s79, 3
+; VI-NEXT: s_lshr_b64 s[78:79], s[16:17], 24
+; VI-NEXT: v_writelane_b32 v62, s78, 0
+; VI-NEXT: s_lshr_b32 s50, s45, 24
+; VI-NEXT: s_lshr_b32 s47, s45, 16
+; VI-NEXT: s_lshr_b32 s53, s45, 8
+; VI-NEXT: s_lshr_b32 s51, s44, 16
+; VI-NEXT: s_lshr_b32 s9, s44, 8
+; VI-NEXT: s_lshr_b32 s43, s57, 24
+; VI-NEXT: s_lshr_b32 s42, s57, 16
+; VI-NEXT: s_lshr_b32 s52, s57, 8
+; VI-NEXT: s_lshr_b32 s46, s56, 16
+; VI-NEXT: s_lshr_b32 s8, s56, 8
+; VI-NEXT: s_lshr_b32 s40, s59, 24
+; VI-NEXT: s_lshr_b32 s15, s59, 16
+; VI-NEXT: s_lshr_b32 s41, s58, 16
+; VI-NEXT: s_lshr_b32 s65, s58, 8
+; VI-NEXT: s_lshr_b32 s83, s61, 24
+; VI-NEXT: s_lshr_b32 s82, s61, 16
+; VI-NEXT: s_lshr_b32 s14, s60, 16
+; VI-NEXT: s_lshr_b32 s64, s60, 8
+; VI-NEXT: s_lshr_b32 s13, s63, 24
+; VI-NEXT: s_lshr_b32 s12, s63, 16
+; VI-NEXT: s_lshr_b32 s87, s63, 8
+; VI-NEXT: s_lshr_b32 s81, s62, 16
+; VI-NEXT: s_lshr_b32 s55, s62, 8
+; VI-NEXT: s_lshr_b32 s71, s73, 24
+; VI-NEXT: s_lshr_b32 s70, s73, 16
+; VI-NEXT: s_lshr_b32 s86, s73, 8
+; VI-NEXT: s_lshr_b32 s80, s72, 16
+; VI-NEXT: s_lshr_b32 s54, s72, 8
+; VI-NEXT: s_lshr_b32 s10, s75, 24
+; VI-NEXT: s_lshr_b32 s69, s75, 16
+; VI-NEXT: s_lshr_b32 s85, s75, 8
+; VI-NEXT: s_lshr_b32 s11, s74, 16
+; VI-NEXT: s_lshr_b32 s7, s74, 8
+; VI-NEXT: s_lshr_b32 s67, s77, 24
+; VI-NEXT: s_lshr_b32 s66, s77, 16
+; VI-NEXT: s_lshr_b32 s84, s77, 8
+; VI-NEXT: s_lshr_b32 s68, s76, 16
+; VI-NEXT: s_lshr_b32 s6, s76, 8
+; VI-NEXT: v_writelane_b32 v62, s79, 1
+; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24
+; VI-NEXT: s_lshr_b64 s[88:89], s[56:57], 24
+; VI-NEXT: s_lshr_b64 s[90:91], s[58:59], 24
+; VI-NEXT: s_lshr_b64 s[30:31], s[60:61], 24
+; VI-NEXT: s_lshr_b64 s[34:35], s[62:63], 24
+; VI-NEXT: s_lshr_b64 s[36:37], s[72:73], 24
+; VI-NEXT: s_lshr_b64 s[38:39], s[74:75], 24
+; VI-NEXT: s_lshr_b64 s[48:49], s[76:77], 24
; VI-NEXT: s_cbranch_execnz .LBB91_4
; VI-NEXT: .LBB91_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s46, s45, 16
-; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s46, v31
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s45, s45, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s45, v31
+; VI-NEXT: s_and_b32 s6, s77, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s77, 16
+; VI-NEXT: v_cndmask_b32_e32 v23, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_and_b32 s6, s76, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v9, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s76, 16
+; VI-NEXT: v_cndmask_b32_e32 v24, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_and_b32 s6, s75, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v8, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s75, 16
+; VI-NEXT: v_cndmask_b32_e32 v48, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_and_b32 s6, s74, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v11, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s74, 16
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s45, s44, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s45, v31
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: s_and_b32 s6, s73, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v10, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s44, s44, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s73, 16
+; VI-NEXT: v_cndmask_b32_e32 v54, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s44, s43, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s44, v31
+; VI-NEXT: s_and_b32 s6, s72, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v13, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s72, 16
+; VI-NEXT: v_cndmask_b32_e32 v44, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s43, s43, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s43, v31
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s43, s42, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s43, v31
-; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; VI-NEXT: s_and_b32 s6, s63, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v12, v26, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s63, 16
+; VI-NEXT: v_cndmask_b32_e32 v59, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s42, s42, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: s_lshl_b32 s42, s41, 16
-; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; VI-NEXT: v_add_f32_e32 v5, s42, v31
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s41, s41, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s41, v31
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s41, s40, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s41, v31
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_and_b32 s40, s40, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s40, s15, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s40, v31
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s15, s15, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s15, v31
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s15, s14, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; VI-NEXT: v_add_f32_e32 v7, s15, v31
-; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: s_and_b32 s14, s14, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_add_f32_e32 v9, s14, v31
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: s_lshl_b32 s14, s13, 16
-; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; VI-NEXT: v_add_f32_e32 v9, s14, v31
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s13, s13, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_add_f32_e32 v10, s13, v31
-; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: s_lshl_b32 s13, s12, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; VI-NEXT: v_add_f32_e32 v9, s13, v31
-; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: s_and_b32 s12, s12, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc
-; VI-NEXT: v_add_f32_e32 v11, s12, v31
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: s_lshl_b32 s12, s11, 16
-; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; VI-NEXT: v_add_f32_e32 v11, s12, v31
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s11, s11, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_add_f32_e32 v12, s11, v31
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: s_lshl_b32 s11, s10, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16
-; VI-NEXT: v_add_f32_e32 v11, s11, v31
-; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: s_and_b32 s10, s10, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s10, v31
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: s_lshl_b32 s10, s9, 16
-; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16
-; VI-NEXT: v_add_f32_e32 v13, s10, v31
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s9, s9, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
-; VI-NEXT: v_add_f32_e32 v14, s9, v31
-; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
-; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: s_lshl_b32 s9, s8, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; VI-NEXT: v_add_f32_e32 v13, s9, v31
-; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
-; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: s_and_b32 s8, s8, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc
-; VI-NEXT: v_add_f32_e32 v15, s8, v31
-; VI-NEXT: v_bfe_u32 v16, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
+; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3
+; VI-NEXT: s_and_b32 s6, s62, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v7, v25, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s62, 16
+; VI-NEXT: v_cndmask_b32_e32 v61, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; VI-NEXT: s_and_b32 s6, s61, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v6, v28, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s61, 16
+; VI-NEXT: v_cndmask_b32_e32 v37, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3
+; VI-NEXT: s_and_b32 s6, s60, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v15, v27, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s60, 16
+; VI-NEXT: v_cndmask_b32_e32 v36, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; VI-NEXT: s_and_b32 s6, s59, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v14, v30, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s59, 16
+; VI-NEXT: v_cndmask_b32_e32 v20, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3
+; VI-NEXT: s_and_b32 s6, s58, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v5, v29, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s58, 16
+; VI-NEXT: v_cndmask_b32_e32 v21, v3, v4, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; VI-NEXT: s_and_b32 s6, s57, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v4, v32, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s57, 16
+; VI-NEXT: v_cndmask_b32_e32 v22, v3, v16, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: s_lshl_b32 s8, s7, 16
-; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16
-; VI-NEXT: v_add_f32_e32 v15, s8, v31
-; VI-NEXT: v_bfe_u32 v16, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; VI-NEXT: s_and_b32 s6, s56, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v17, v31, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s56, 16
+; VI-NEXT: v_cndmask_b32_e32 v51, v3, v16, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v16, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: s_and_b32 s7, s7, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s7, v31
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s7, s6, 16
-; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16
-; VI-NEXT: v_add_f32_e32 v15, s7, v31
-; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: s_and_b32 s6, s6, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s6, v31
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v16, v18, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; VI-NEXT: s_and_b32 s6, s45, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v16, v34, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s45, 16
+; VI-NEXT: v_cndmask_b32_e32 v52, v3, v18, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v18, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: s_lshl_b32 s6, s17, 16
-; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16
-; VI-NEXT: v_add_f32_e32 v17, s6, v31
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: s_and_b32 s6, s44, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v19, v33, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_lshl_b32 s6, s44, 16
+; VI-NEXT: v_cndmask_b32_e32 v40, v3, v18, vcc
+; VI-NEXT: v_add_f32_e32 v3, s6, v1
+; VI-NEXT: v_bfe_u32 v18, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v18, v35, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3
; VI-NEXT: s_and_b32 s6, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_add_f32_e32 v18, s6, v31
-; VI-NEXT: v_bfe_u32 v19, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18
-; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19
-; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; VI-NEXT: s_lshl_b32 s6, s16, 16
-; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16
-; VI-NEXT: v_add_f32_e32 v17, s6, v31
-; VI-NEXT: v_bfe_u32 v19, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17
-; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19
-; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_or_b32_e32 v18, v35, v2
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s78, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s17, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s78, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s17, s6, 16
; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc
-; VI-NEXT: v_add_f32_e32 v19, s6, v31
-; VI-NEXT: v_bfe_u32 v20, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19
-; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20
-; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: s_lshl_b32 s6, s19, 16
-; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16
-; VI-NEXT: v_add_f32_e32 v19, s6, v31
-; VI-NEXT: v_bfe_u32 v20, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19
-; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20
-; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s45, s17, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s79, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s16, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s79, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s16, s6, 16
; VI-NEXT: s_and_b32 s6, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
-; VI-NEXT: v_add_f32_e32 v20, s6, v31
-; VI-NEXT: v_bfe_u32 v21, v20, 16, 1
-; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20
-; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
-; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: s_lshl_b32 s6, s18, 16
-; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16
-; VI-NEXT: v_add_f32_e32 v19, s6, v31
-; VI-NEXT: v_bfe_u32 v21, v19, 16, 1
-; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19
-; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21
-; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc
-; VI-NEXT: v_add_f32_e32 v21, s6, v31
-; VI-NEXT: v_bfe_u32 v22, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21
-; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22
-; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: s_lshl_b32 s6, s21, 16
-; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16
-; VI-NEXT: v_add_f32_e32 v21, s6, v31
-; VI-NEXT: v_bfe_u32 v22, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21
-; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22
-; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: s_and_b32 s6, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
-; VI-NEXT: v_add_f32_e32 v22, s6, v31
-; VI-NEXT: v_bfe_u32 v23, v22, 16, 1
-; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22
-; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
-; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: s_lshl_b32 s6, s20, 16
-; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16
-; VI-NEXT: v_add_f32_e32 v21, s6, v31
-; VI-NEXT: v_bfe_u32 v23, v21, 16, 1
-; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21
-; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23
-; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc
-; VI-NEXT: v_add_f32_e32 v23, s6, v31
-; VI-NEXT: v_bfe_u32 v24, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23
-; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24
-; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: s_lshl_b32 s6, s23, 16
-; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16
-; VI-NEXT: v_add_f32_e32 v23, s6, v31
-; VI-NEXT: v_bfe_u32 v24, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23
-; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24
-; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: s_and_b32 s6, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
-; VI-NEXT: v_add_f32_e32 v24, s6, v31
-; VI-NEXT: v_bfe_u32 v25, v24, 16, 1
-; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24
-; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
-; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: s_lshl_b32 s6, s22, 16
-; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16
-; VI-NEXT: v_add_f32_e32 v23, s6, v31
-; VI-NEXT: v_bfe_u32 v25, v23, 16, 1
-; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23
-; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25
-; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc
-; VI-NEXT: v_add_f32_e32 v25, s6, v31
-; VI-NEXT: v_bfe_u32 v26, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26
-; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: s_lshl_b32 s6, s25, 16
-; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16
-; VI-NEXT: v_add_f32_e32 v25, s6, v31
-; VI-NEXT: v_bfe_u32 v26, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25
-; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26
-; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: s_and_b32 s6, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
-; VI-NEXT: v_add_f32_e32 v26, s6, v31
-; VI-NEXT: v_bfe_u32 v27, v26, 16, 1
-; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26
-; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
-; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: s_lshl_b32 s6, s24, 16
-; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16
-; VI-NEXT: v_add_f32_e32 v25, s6, v31
-; VI-NEXT: v_bfe_u32 v27, v25, 16, 1
-; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25
-; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27
-; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc
-; VI-NEXT: v_add_f32_e32 v27, s6, v31
-; VI-NEXT: v_bfe_u32 v28, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27
-; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
-; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: s_lshl_b32 s6, s27, 16
-; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16
-; VI-NEXT: v_add_f32_e32 v27, s6, v31
-; VI-NEXT: v_bfe_u32 v28, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27
-; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28
-; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: s_and_b32 s6, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
-; VI-NEXT: v_add_f32_e32 v28, s6, v31
-; VI-NEXT: v_bfe_u32 v29, v28, 16, 1
-; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28
-; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
-; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: s_lshl_b32 s6, s26, 16
-; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16
-; VI-NEXT: v_add_f32_e32 v27, s6, v31
-; VI-NEXT: v_bfe_u32 v29, v27, 16, 1
-; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27
-; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29
-; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc
-; VI-NEXT: v_add_f32_e32 v29, s6, v31
-; VI-NEXT: v_bfe_u32 v30, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29
-; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
-; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: s_lshl_b32 s6, s29, 16
-; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16
-; VI-NEXT: v_add_f32_e32 v29, s6, v31
-; VI-NEXT: v_bfe_u32 v30, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29
-; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30
-; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: s_and_b32 s6, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc
-; VI-NEXT: v_add_f32_e32 v30, s6, v31
-; VI-NEXT: v_bfe_u32 v32, v30, 16, 1
-; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30
-; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: s_lshl_b32 s6, s28, 16
-; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16
-; VI-NEXT: v_add_f32_e32 v29, s6, v31
-; VI-NEXT: v_bfe_u32 v32, v29, 16, 1
-; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29
-; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32
-; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; VI-NEXT: s_and_b32 s6, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
-; VI-NEXT: v_add_f32_e32 v32, s6, v31
-; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: s_lshl_b32 s6, s5, 16
-; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16
-; VI-NEXT: v_add_f32_e32 v32, s6, v31
-; VI-NEXT: v_bfe_u32 v33, v32, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; VI-NEXT: s_and_b32 s5, s5, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; VI-NEXT: v_add_f32_e32 v33, s5, v31
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; VI-NEXT: s_lshl_b32 s5, s4, 16
-; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16
-; VI-NEXT: v_add_f32_e32 v33, s5, v31
-; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: s_and_b32 s4, s4, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; VI-NEXT: v_add_f32_e32 v31, s4, v31
-; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
-; VI-NEXT: v_bfe_u32 v34, v31, 16, 1
-; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31
-; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34
-; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
-; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2
-; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24]
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s44, s16, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s9, s8
+; VI-NEXT: s_lshl_b32 s8, s19, 16
+; VI-NEXT: v_add_f32_e32 v2, s8, v1
+; VI-NEXT: v_readfirstlane_b32 s8, v2
+; VI-NEXT: s_bfe_u32 s9, s8, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s8
+; VI-NEXT: s_and_b32 s7, s6, 0xffff0000
+; VI-NEXT: s_add_i32 s10, s9, 0x7fff
+; VI-NEXT: s_or_b32 s11, s8, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s11, s10
+; VI-NEXT: s_lshr_b32 s19, s8, 16
+; VI-NEXT: s_or_b32 s57, s19, s7
+; VI-NEXT: s_and_b32 s7, s18, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s7, v1
+; VI-NEXT: v_readfirstlane_b32 s7, v2
+; VI-NEXT: s_bfe_u32 s8, s7, 0x10010
+; VI-NEXT: s_add_i32 s8, s8, s7
+; VI-NEXT: s_add_i32 s10, s8, 0x7fff
+; VI-NEXT: s_bitset1_b32 s7, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s7, s7, s10
+; VI-NEXT: s_lshl_b32 s8, s18, 16
+; VI-NEXT: v_add_f32_e32 v2, s8, v1
+; VI-NEXT: v_readfirstlane_b32 s8, v2
+; VI-NEXT: s_bfe_u32 s9, s8, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s8
+; VI-NEXT: s_and_b32 s10, s7, 0xffff0000
+; VI-NEXT: s_add_i32 s11, s9, 0x7fff
+; VI-NEXT: s_or_b32 s12, s8, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s12, s11
+; VI-NEXT: s_lshr_b32 s18, s8, 16
+; VI-NEXT: s_and_b32 s8, s21, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s8, v1
+; VI-NEXT: v_readfirstlane_b32 s8, v2
+; VI-NEXT: s_bfe_u32 s9, s8, 0x10010
+; VI-NEXT: s_add_i32 s9, s9, s8
+; VI-NEXT: s_or_b32 s56, s18, s10
+; VI-NEXT: s_add_i32 s10, s9, 0x7fff
+; VI-NEXT: s_or_b32 s11, s8, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[8:9], vcc, exec
+; VI-NEXT: s_cselect_b32 s8, s11, s10
+; VI-NEXT: s_lshl_b32 s10, s21, 16
+; VI-NEXT: v_add_f32_e32 v2, s10, v1
+; VI-NEXT: v_readfirstlane_b32 s10, v2
+; VI-NEXT: s_bfe_u32 s11, s10, 0x10010
+; VI-NEXT: s_add_i32 s11, s11, s10
+; VI-NEXT: s_and_b32 s9, s8, 0xffff0000
+; VI-NEXT: s_add_i32 s12, s11, 0x7fff
+; VI-NEXT: s_or_b32 s13, s10, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s13, s12
+; VI-NEXT: s_lshr_b32 s21, s10, 16
+; VI-NEXT: s_or_b32 s59, s21, s9
+; VI-NEXT: s_and_b32 s9, s20, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s9, v1
+; VI-NEXT: v_readfirstlane_b32 s9, v2
+; VI-NEXT: s_bfe_u32 s10, s9, 0x10010
+; VI-NEXT: s_add_i32 s10, s10, s9
+; VI-NEXT: s_add_i32 s12, s10, 0x7fff
+; VI-NEXT: s_bitset1_b32 s9, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s9, s9, s12
+; VI-NEXT: s_lshl_b32 s10, s20, 16
+; VI-NEXT: v_add_f32_e32 v2, s10, v1
+; VI-NEXT: v_readfirstlane_b32 s10, v2
+; VI-NEXT: s_bfe_u32 s11, s10, 0x10010
+; VI-NEXT: s_add_i32 s11, s11, s10
+; VI-NEXT: s_and_b32 s12, s9, 0xffff0000
+; VI-NEXT: s_add_i32 s13, s11, 0x7fff
+; VI-NEXT: s_or_b32 s14, s10, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s14, s13
+; VI-NEXT: s_lshr_b32 s20, s10, 16
+; VI-NEXT: s_and_b32 s10, s23, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s10, v1
+; VI-NEXT: v_readfirstlane_b32 s10, v2
+; VI-NEXT: s_bfe_u32 s11, s10, 0x10010
+; VI-NEXT: s_add_i32 s11, s11, s10
+; VI-NEXT: s_or_b32 s58, s20, s12
+; VI-NEXT: s_add_i32 s12, s11, 0x7fff
+; VI-NEXT: s_or_b32 s13, s10, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[10:11], vcc, exec
+; VI-NEXT: s_cselect_b32 s10, s13, s12
+; VI-NEXT: s_lshl_b32 s12, s23, 16
+; VI-NEXT: v_add_f32_e32 v2, s12, v1
+; VI-NEXT: v_readfirstlane_b32 s12, v2
+; VI-NEXT: s_bfe_u32 s13, s12, 0x10010
+; VI-NEXT: s_add_i32 s13, s13, s12
+; VI-NEXT: s_and_b32 s11, s10, 0xffff0000
+; VI-NEXT: s_add_i32 s14, s13, 0x7fff
+; VI-NEXT: s_or_b32 s15, s12, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s15, s14
+; VI-NEXT: s_lshr_b32 s23, s12, 16
+; VI-NEXT: s_or_b32 s61, s23, s11
+; VI-NEXT: s_and_b32 s11, s22, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s11, v1
+; VI-NEXT: v_readfirstlane_b32 s11, v2
+; VI-NEXT: s_bfe_u32 s12, s11, 0x10010
+; VI-NEXT: s_add_i32 s12, s12, s11
+; VI-NEXT: s_add_i32 s14, s12, 0x7fff
+; VI-NEXT: s_bitset1_b32 s11, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s11, s11, s14
+; VI-NEXT: s_lshl_b32 s12, s22, 16
+; VI-NEXT: v_add_f32_e32 v2, s12, v1
+; VI-NEXT: v_readfirstlane_b32 s12, v2
+; VI-NEXT: s_bfe_u32 s13, s12, 0x10010
+; VI-NEXT: s_add_i32 s13, s13, s12
+; VI-NEXT: s_and_b32 s14, s11, 0xffff0000
+; VI-NEXT: s_add_i32 s15, s13, 0x7fff
+; VI-NEXT: s_or_b32 s22, s12, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s22, s15
+; VI-NEXT: s_lshr_b32 s22, s12, 16
+; VI-NEXT: s_and_b32 s12, s25, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s12, v1
+; VI-NEXT: v_readfirstlane_b32 s12, v2
+; VI-NEXT: s_bfe_u32 s13, s12, 0x10010
+; VI-NEXT: s_add_i32 s13, s13, s12
+; VI-NEXT: s_or_b32 s60, s22, s14
+; VI-NEXT: s_add_i32 s14, s13, 0x7fff
+; VI-NEXT: s_or_b32 s15, s12, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[12:13], vcc, exec
+; VI-NEXT: s_cselect_b32 s12, s15, s14
+; VI-NEXT: s_lshl_b32 s14, s25, 16
+; VI-NEXT: v_add_f32_e32 v2, s14, v1
+; VI-NEXT: v_readfirstlane_b32 s14, v2
+; VI-NEXT: s_bfe_u32 s15, s14, 0x10010
+; VI-NEXT: s_add_i32 s15, s15, s14
+; VI-NEXT: s_and_b32 s13, s12, 0xffff0000
+; VI-NEXT: s_add_i32 s25, s15, 0x7fff
+; VI-NEXT: s_or_b32 s40, s14, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s40, s25
+; VI-NEXT: s_lshr_b32 s25, s14, 16
+; VI-NEXT: s_or_b32 s63, s25, s13
+; VI-NEXT: s_and_b32 s13, s24, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s13, v1
+; VI-NEXT: v_readfirstlane_b32 s13, v2
+; VI-NEXT: s_bfe_u32 s14, s13, 0x10010
+; VI-NEXT: s_add_i32 s14, s14, s13
+; VI-NEXT: s_add_i32 s40, s14, 0x7fff
+; VI-NEXT: s_bitset1_b32 s13, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s13, s13, s40
+; VI-NEXT: s_lshl_b32 s14, s24, 16
+; VI-NEXT: v_add_f32_e32 v2, s14, v1
+; VI-NEXT: v_readfirstlane_b32 s14, v2
+; VI-NEXT: s_bfe_u32 s15, s14, 0x10010
+; VI-NEXT: s_add_i32 s15, s15, s14
+; VI-NEXT: s_and_b32 s40, s13, 0xffff0000
+; VI-NEXT: s_add_i32 s24, s15, 0x7fff
+; VI-NEXT: s_or_b32 s41, s14, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s41, s24
+; VI-NEXT: s_lshr_b32 s24, s14, 16
+; VI-NEXT: s_and_b32 s14, s27, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s14, v1
+; VI-NEXT: v_readfirstlane_b32 s14, v2
+; VI-NEXT: s_bfe_u32 s15, s14, 0x10010
+; VI-NEXT: s_add_i32 s15, s15, s14
+; VI-NEXT: s_or_b32 s62, s24, s40
+; VI-NEXT: s_add_i32 s40, s15, 0x7fff
+; VI-NEXT: s_or_b32 s41, s14, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[14:15], vcc, exec
+; VI-NEXT: s_cselect_b32 s14, s41, s40
+; VI-NEXT: s_lshl_b32 s27, s27, 16
+; VI-NEXT: v_add_f32_e32 v2, s27, v1
+; VI-NEXT: v_readfirstlane_b32 s27, v2
+; VI-NEXT: s_bfe_u32 s40, s27, 0x10010
+; VI-NEXT: s_add_i32 s40, s40, s27
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[10:11]
+; VI-NEXT: s_and_b32 s15, s14, 0xffff0000
+; VI-NEXT: s_add_i32 s42, s40, 0x7fff
+; VI-NEXT: s_bitset1_b32 s27, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v20
+; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v20
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s27, s27, s42
+; VI-NEXT: s_lshr_b32 s27, s27, 16
+; VI-NEXT: s_or_b32 s73, s27, s15
+; VI-NEXT: s_and_b32 s15, s26, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s15, v1
+; VI-NEXT: v_readfirstlane_b32 s15, v2
+; VI-NEXT: s_bfe_u32 s40, s15, 0x10010
+; VI-NEXT: s_add_i32 s40, s40, s15
+; VI-NEXT: s_add_i32 s42, s40, 0x7fff
+; VI-NEXT: s_bitset1_b32 s15, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s15, s15, s42
+; VI-NEXT: s_lshl_b32 s26, s26, 16
+; VI-NEXT: v_add_f32_e32 v2, s26, v1
+; VI-NEXT: v_readfirstlane_b32 s26, v2
+; VI-NEXT: s_bfe_u32 s40, s26, 0x10010
+; VI-NEXT: s_add_i32 s40, s40, s26
+; VI-NEXT: s_and_b32 s42, s15, 0xffff0000
+; VI-NEXT: s_add_i32 s43, s40, 0x7fff
+; VI-NEXT: s_bitset1_b32 s26, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s26, s26, s43
+; VI-NEXT: s_and_b32 s40, s29, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s40, v1
+; VI-NEXT: v_readfirstlane_b32 s40, v2
+; VI-NEXT: s_bfe_u32 s41, s40, 0x10010
+; VI-NEXT: s_lshr_b32 s26, s26, 16
+; VI-NEXT: s_add_i32 s41, s41, s40
+; VI-NEXT: s_or_b32 s72, s26, s42
+; VI-NEXT: s_add_i32 s42, s41, 0x7fff
+; VI-NEXT: s_or_b32 s43, s40, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[40:41], vcc, exec
+; VI-NEXT: s_cselect_b32 s40, s43, s42
+; VI-NEXT: s_lshl_b32 s29, s29, 16
+; VI-NEXT: v_add_f32_e32 v2, s29, v1
+; VI-NEXT: v_readfirstlane_b32 s29, v2
+; VI-NEXT: s_bfe_u32 s42, s29, 0x10010
+; VI-NEXT: s_add_i32 s42, s42, s29
+; VI-NEXT: s_and_b32 s41, s40, 0xffff0000
+; VI-NEXT: s_add_i32 s46, s42, 0x7fff
+; VI-NEXT: s_bitset1_b32 s29, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s29, s29, s46
+; VI-NEXT: s_lshr_b32 s29, s29, 16
+; VI-NEXT: s_or_b32 s75, s29, s41
+; VI-NEXT: s_and_b32 s41, s28, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s41, v1
+; VI-NEXT: v_readfirstlane_b32 s41, v2
+; VI-NEXT: s_bfe_u32 s42, s41, 0x10010
+; VI-NEXT: s_add_i32 s42, s42, s41
+; VI-NEXT: s_add_i32 s46, s42, 0x7fff
+; VI-NEXT: s_bitset1_b32 s41, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s41, s41, s46
+; VI-NEXT: s_lshl_b32 s28, s28, 16
+; VI-NEXT: v_add_f32_e32 v2, s28, v1
+; VI-NEXT: v_readfirstlane_b32 s28, v2
+; VI-NEXT: s_bfe_u32 s42, s28, 0x10010
+; VI-NEXT: s_add_i32 s42, s42, s28
+; VI-NEXT: s_and_b32 s46, s41, 0xffff0000
+; VI-NEXT: s_add_i32 s47, s42, 0x7fff
+; VI-NEXT: s_bitset1_b32 s28, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s28, s28, s47
+; VI-NEXT: s_and_b32 s42, s5, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s42, v1
+; VI-NEXT: v_readfirstlane_b32 s42, v2
+; VI-NEXT: s_bfe_u32 s43, s42, 0x10010
+; VI-NEXT: s_lshr_b32 s28, s28, 16
+; VI-NEXT: s_add_i32 s43, s43, s42
+; VI-NEXT: s_or_b32 s74, s28, s46
+; VI-NEXT: s_add_i32 s46, s43, 0x7fff
+; VI-NEXT: s_or_b32 s47, s42, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s46, s47, s46
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s42, s5, 0x10010
+; VI-NEXT: s_add_i32 s42, s42, s5
+; VI-NEXT: s_and_b32 s47, s46, 0xffff0000
+; VI-NEXT: s_add_i32 s76, s42, 0x7fff
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s5, s5, s76
+; VI-NEXT: s_and_b32 s42, s4, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s42, v1
+; VI-NEXT: v_readfirstlane_b32 s42, v2
+; VI-NEXT: s_bfe_u32 s43, s42, 0x10010
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: s_add_i32 s43, s43, s42
+; VI-NEXT: s_or_b32 s77, s5, s47
+; VI-NEXT: s_add_i32 s47, s43, 0x7fff
+; VI-NEXT: s_or_b32 s76, s42, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: s_cselect_b32 s47, s76, s47
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: s_bfe_u32 s42, s4, 0x10010
+; VI-NEXT: s_add_i32 s42, s42, s4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[18:19]
+; VI-NEXT: s_and_b32 s76, s47, 0xffff0000
+; VI-NEXT: s_add_i32 s88, s42, 0x7fff
+; VI-NEXT: s_bitset1_b32 s4, 22
+; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[16:17]
+; VI-NEXT: s_and_b64 s[42:43], vcc, exec
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v4
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[4:5]
+; VI-NEXT: s_cselect_b32 s4, s4, s88
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5
+; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[14:15]
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v6
+; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[6:7]
+; VI-NEXT: s_or_b32 s76, s4, s76
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7
+; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[12:13]
+; VI-NEXT: s_lshr_b64 s[88:89], s[76:77], 24
+; VI-NEXT: s_lshr_b64 s[30:31], s[74:75], 24
+; VI-NEXT: s_lshr_b64 s[34:35], s[72:73], 24
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8
+; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[8:9]
+; VI-NEXT: s_lshr_b64 s[38:39], s[62:63], 24
+; VI-NEXT: s_lshr_b64 s[48:49], s[60:61], 24
+; VI-NEXT: s_lshr_b64 s[50:51], s[58:59], 24
+; VI-NEXT: s_lshr_b64 s[52:53], s[56:57], 24
+; VI-NEXT: s_lshr_b64 s[54:55], s[44:45], 24
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v9
+; VI-NEXT: s_lshr_b32 s42, s77, 8
+; VI-NEXT: s_lshr_b32 s76, s76, 8
+; VI-NEXT: s_lshr_b32 s75, s75, 8
+; VI-NEXT: s_lshr_b32 s74, s74, 8
+; VI-NEXT: s_lshr_b32 s73, s73, 8
+; VI-NEXT: s_lshr_b32 s72, s72, 8
+; VI-NEXT: s_lshr_b32 s63, s63, 8
+; VI-NEXT: s_lshr_b32 s62, s62, 8
+; VI-NEXT: s_lshr_b32 s61, s61, 8
+; VI-NEXT: s_lshr_b32 s60, s60, 8
+; VI-NEXT: s_lshr_b32 s59, s59, 8
+; VI-NEXT: s_lshr_b32 s58, s58, 8
+; VI-NEXT: s_lshr_b32 s77, s57, 8
+; VI-NEXT: s_lshr_b32 s90, s56, 8
+; VI-NEXT: s_lshr_b32 vcc_lo, s45, 8
+; VI-NEXT: s_lshr_b32 vcc_hi, s44, 8
+; VI-NEXT: s_lshr_b32 s43, s46, 24
+; VI-NEXT: s_lshr_b32 s44, s46, 16
+; VI-NEXT: s_lshr_b32 s45, s47, 16
+; VI-NEXT: s_lshr_b32 s46, s40, 24
+; VI-NEXT: s_lshr_b32 s40, s40, 16
+; VI-NEXT: s_lshr_b32 s41, s41, 16
+; VI-NEXT: s_lshr_b32 s47, s14, 24
+; VI-NEXT: s_lshr_b32 s14, s14, 16
+; VI-NEXT: s_lshr_b32 s15, s15, 16
+; VI-NEXT: s_lshr_b32 s56, s12, 24
+; VI-NEXT: s_lshr_b32 s12, s12, 16
+; VI-NEXT: s_lshr_b32 s13, s13, 16
+; VI-NEXT: s_lshr_b32 s57, s10, 24
+; VI-NEXT: s_lshr_b32 s10, s10, 16
+; VI-NEXT: s_lshr_b32 s11, s11, 16
+; VI-NEXT: s_lshr_b32 s89, s8, 24
+; VI-NEXT: s_lshr_b32 s91, s8, 16
+; VI-NEXT: s_lshr_b32 s9, s9, 16
+; VI-NEXT: s_lshr_b32 s31, s6, 24
+; VI-NEXT: s_lshr_b32 s35, s6, 16
+; VI-NEXT: s_lshr_b32 s36, s7, 16
+; VI-NEXT: s_lshr_b32 s37, s78, 24
+; VI-NEXT: s_lshr_b32 s78, s78, 16
+; VI-NEXT: s_lshr_b32 s8, s79, 16
+; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v52
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v52
+; VI-NEXT: v_lshrrev_b32_e32 v12, 24, v22
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v37
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v37
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36
+; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v59
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v61
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v54
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v48
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; VI-NEXT: v_lshrrev_b32_e32 v20, 24, v23
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v24
; VI-NEXT: s_branch .LBB91_5
; VI-NEXT: .LBB91_3:
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr71
-; VI-NEXT: ; implicit-def: $sgpr69
-; VI-NEXT: ; implicit-def: $sgpr70
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; kill: killed $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 0
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 1
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: ; implicit-def: $sgpr68
-; VI-NEXT: ; implicit-def: $sgpr67
+; VI-NEXT: ; implicit-def: $sgpr84
; VI-NEXT: ; implicit-def: $sgpr66
+; VI-NEXT: ; implicit-def: $sgpr67
+; VI-NEXT: ; implicit-def: $sgpr7
+; VI-NEXT: ; implicit-def: $sgpr11
+; VI-NEXT: ; implicit-def: $sgpr85
+; VI-NEXT: ; implicit-def: $sgpr69
+; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr54
+; VI-NEXT: ; implicit-def: $sgpr80
+; VI-NEXT: ; implicit-def: $sgpr86
+; VI-NEXT: ; implicit-def: $sgpr70
+; VI-NEXT: ; implicit-def: $sgpr71
+; VI-NEXT: ; implicit-def: $sgpr55
+; VI-NEXT: ; implicit-def: $sgpr81
+; VI-NEXT: ; implicit-def: $sgpr87
+; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr13
; VI-NEXT: ; implicit-def: $sgpr64
+; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr82
+; VI-NEXT: ; implicit-def: $sgpr83
; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr55
-; VI-NEXT: ; implicit-def: $sgpr54
-; VI-NEXT: ; implicit-def: $sgpr53
-; VI-NEXT: ; implicit-def: $sgpr51
+; VI-NEXT: ; implicit-def: $sgpr41
+; VI-NEXT: ; kill: killed $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr40
+; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 2
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 3
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; implicit-def: $sgpr52
+; VI-NEXT: ; implicit-def: $sgpr42
+; VI-NEXT: ; implicit-def: $sgpr43
+; VI-NEXT: ; implicit-def: $sgpr9
+; VI-NEXT: ; implicit-def: $sgpr51
+; VI-NEXT: ; implicit-def: $sgpr53
+; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr50
-; VI-NEXT: ; implicit-def: $sgpr87
-; VI-NEXT: ; implicit-def: $sgpr86
-; VI-NEXT: ; implicit-def: $sgpr84
-; VI-NEXT: ; implicit-def: $sgpr85
-; VI-NEXT: ; implicit-def: $sgpr83
-; VI-NEXT: ; implicit-def: $sgpr82
-; VI-NEXT: ; implicit-def: $sgpr81
-; VI-NEXT: ; implicit-def: $sgpr80
-; VI-NEXT: ; implicit-def: $sgpr76
-; VI-NEXT: ; implicit-def: $sgpr74
-; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr62
-; VI-NEXT: ; implicit-def: $sgpr60
-; VI-NEXT: ; implicit-def: $sgpr58
-; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr48
; VI-NEXT: ; implicit-def: $sgpr38
; VI-NEXT: ; implicit-def: $sgpr36
@@ -165349,406 +166747,433 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr90
; VI-NEXT: ; implicit-def: $sgpr88
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 4
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 5
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
; VI-NEXT: ; implicit-def: $sgpr78
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 7
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 8
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 9
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 10
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 11
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 12
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 13
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s78, 14
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: v_writelane_b32 v62, s79, 15
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr78
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; kill: killed $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: s_branch .LBB91_2
; VI-NEXT: .LBB91_4:
-; VI-NEXT: v_mov_b32_e32 v33, s71
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s69
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s70
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s68
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s67
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s86
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s83
-; VI-NEXT: v_mov_b32_e32 v31, s4
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s82
-; VI-NEXT: v_readlane_b32 s4, v62, 0
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 1
-; VI-NEXT: v_mov_b32_e32 v40, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 2
-; VI-NEXT: v_mov_b32_e32 v44, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 3
-; VI-NEXT: v_mov_b32_e32 v54, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 4
-; VI-NEXT: v_mov_b32_e32 v53, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 5
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 6
-; VI-NEXT: v_mov_b32_e32 v51, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 7
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 8
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 9
-; VI-NEXT: v_mov_b32_e32 v56, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 10
-; VI-NEXT: v_mov_b32_e32 v47, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 11
-; VI-NEXT: v_mov_b32_e32 v48, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 12
-; VI-NEXT: v_mov_b32_e32 v43, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 13
-; VI-NEXT: v_mov_b32_e32 v46, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 14
-; VI-NEXT: v_mov_b32_e32 v50, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 15
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 16
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 17
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 18
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v33, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 19
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 20
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 21
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 22
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 23
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 24
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 25
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 26
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 27
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 28
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 29
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 30
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 31
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 32
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 33
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 34
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 35
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 36
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 37
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 38
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 39
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 40
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 41
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 42
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 43
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 44
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 45
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 46
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 47
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 48
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 49
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 50
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 51
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 52
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 53
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 54
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 55
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 56
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_readlane_b32 s4, v62, 57
-; VI-NEXT: v_mov_b32_e32 v42, s54
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v55, s4
-; VI-NEXT: v_mov_b32_e32 v41, s46
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s56
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s58
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v41, s60
-; VI-NEXT: v_mov_b32_e32 v45, s72
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s74
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s76
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v45, s78
-; VI-NEXT: v_mov_b32_e32 v55, s88
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v36, s66
-; VI-NEXT: v_mov_b32_e32 v52, s64
-; VI-NEXT: v_mov_b32_e32 v55, v50
-; VI-NEXT: v_mov_b32_e32 v35, s30
-; VI-NEXT: v_mov_b32_e32 v59, s87
-; VI-NEXT: v_mov_b32_e32 v58, s34
-; VI-NEXT: v_mov_b32_e32 v45, s36
-; VI-NEXT: v_mov_b32_e32 v34, s38
-; VI-NEXT: v_mov_b32_e32 v1, s44
-; VI-NEXT: v_mov_b32_e32 v2, s45
-; VI-NEXT: v_mov_b32_e32 v3, s42
-; VI-NEXT: v_mov_b32_e32 v4, s43
-; VI-NEXT: v_mov_b32_e32 v5, s40
-; VI-NEXT: v_mov_b32_e32 v6, s41
-; VI-NEXT: v_mov_b32_e32 v7, s14
-; VI-NEXT: v_mov_b32_e32 v8, s15
-; VI-NEXT: v_mov_b32_e32 v9, s12
-; VI-NEXT: v_mov_b32_e32 v10, s13
-; VI-NEXT: v_mov_b32_e32 v11, s10
-; VI-NEXT: v_mov_b32_e32 v12, s11
-; VI-NEXT: v_mov_b32_e32 v13, s8
-; VI-NEXT: v_mov_b32_e32 v14, s9
-; VI-NEXT: v_mov_b32_e32 v15, s6
-; VI-NEXT: v_mov_b32_e32 v16, s7
-; VI-NEXT: v_mov_b32_e32 v17, s16
-; VI-NEXT: v_mov_b32_e32 v18, s17
-; VI-NEXT: v_mov_b32_e32 v19, s18
-; VI-NEXT: v_mov_b32_e32 v20, s19
-; VI-NEXT: v_mov_b32_e32 v21, s20
-; VI-NEXT: v_mov_b32_e32 v22, s21
-; VI-NEXT: v_mov_b32_e32 v23, s22
-; VI-NEXT: v_mov_b32_e32 v24, s23
-; VI-NEXT: v_mov_b32_e32 v25, s24
-; VI-NEXT: v_mov_b32_e32 v26, s25
-; VI-NEXT: v_mov_b32_e32 v27, s26
-; VI-NEXT: v_mov_b32_e32 v28, s27
-; VI-NEXT: v_mov_b32_e32 v29, s28
-; VI-NEXT: v_mov_b32_e32 v30, s29
-; VI-NEXT: v_mov_b32_e32 v32, s5
-; VI-NEXT: v_mov_b32_e32 v41, s62
-; VI-NEXT: v_mov_b32_e32 v57, s81
-; VI-NEXT: v_mov_b32_e32 v37, s84
-; VI-NEXT: v_mov_b32_e32 v60, s52
-; VI-NEXT: v_mov_b32_e32 v38, s51
-; VI-NEXT: v_mov_b32_e32 v61, s65
-; VI-NEXT: v_mov_b32_e32 v49, s66
-; VI-NEXT: v_mov_b32_e32 v39, s55
-; VI-NEXT: v_mov_b32_e32 v50, v46
-; VI-NEXT: v_mov_b32_e32 v46, v48
-; VI-NEXT: v_mov_b32_e32 v48, v47
-; VI-NEXT: v_mov_b32_e32 v47, v56
-; VI-NEXT: v_mov_b32_e32 v56, v51
-; VI-NEXT: v_mov_b32_e32 v51, s90
-; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v35, s85
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v34, s48
-; VI-NEXT: v_mov_b32_e32 v51, v53
-; VI-NEXT: v_mov_b32_e32 v53, v54
-; VI-NEXT: v_mov_b32_e32 v54, v40
-; VI-NEXT: v_mov_b32_e32 v40, s80
-; VI-NEXT: v_mov_b32_e32 v58, s50
-; VI-NEXT: v_mov_b32_e32 v45, s53
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s73
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s74
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s75
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s76
+; VI-NEXT: v_mov_b32_e32 v16, s8
+; VI-NEXT: v_readlane_b32 s8, v62, 16
+; VI-NEXT: v_mov_b32_e32 v2, s88
+; VI-NEXT: v_mov_b32_e32 v4, s30
+; VI-NEXT: v_mov_b32_e32 v5, s34
+; VI-NEXT: v_readlane_b32 s88, v62, 14
+; VI-NEXT: v_readlane_b32 s30, v62, 12
+; VI-NEXT: v_readlane_b32 s34, v62, 10
+; VI-NEXT: v_mov_b32_e32 v11, s51
+; VI-NEXT: v_mov_b32_e32 v8, s50
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, s77
+; VI-NEXT: v_mov_b32_e32 v19, s53
+; VI-NEXT: v_mov_b32_e32 v17, s52
+; VI-NEXT: v_mov_b32_e32 v38, s8
+; VI-NEXT: v_readlane_b32 s8, v62, 17
+; VI-NEXT: v_mov_b32_e32 v15, s55
+; VI-NEXT: v_mov_b32_e32 v55, s54
+; VI-NEXT: v_mov_b32_e32 v10, s38
+; VI-NEXT: v_mov_b32_e32 v7, s48
+; VI-NEXT: v_readlane_b32 s89, v62, 15
+; VI-NEXT: v_readlane_b32 s31, v62, 13
+; VI-NEXT: v_readlane_b32 s35, v62, 11
+; VI-NEXT: v_readlane_b32 s38, v62, 8
+; VI-NEXT: v_readlane_b32 s48, v62, 6
+; VI-NEXT: v_readlane_b32 s50, v62, 4
+; VI-NEXT: v_readlane_b32 s52, v62, 2
+; VI-NEXT: v_readlane_b32 s54, v62, 0
+; VI-NEXT: v_mov_b32_e32 v35, s44
+; VI-NEXT: v_mov_b32_e32 v33, s45
+; VI-NEXT: v_mov_b32_e32 v9, s47
+; VI-NEXT: v_mov_b32_e32 v34, s56
+; VI-NEXT: v_mov_b32_e32 v52, s46
+; VI-NEXT: v_mov_b32_e32 v31, s57
+; VI-NEXT: v_mov_b32_e32 v12, s43
+; VI-NEXT: v_mov_b32_e32 v13, s42
+; VI-NEXT: v_mov_b32_e32 v32, s58
+; VI-NEXT: v_mov_b32_e32 v42, s41
+; VI-NEXT: v_mov_b32_e32 v29, s59
+; VI-NEXT: v_mov_b32_e32 v40, s40
+; VI-NEXT: v_mov_b32_e32 v41, s15
+; VI-NEXT: v_mov_b32_e32 v30, s60
+; VI-NEXT: v_mov_b32_e32 v57, s14
+; VI-NEXT: v_mov_b32_e32 v27, s61
+; VI-NEXT: v_mov_b32_e32 v46, s83
+; VI-NEXT: v_mov_b32_e32 v47, s82
+; VI-NEXT: v_mov_b32_e32 v28, s62
+; VI-NEXT: v_mov_b32_e32 v61, s81
+; VI-NEXT: v_mov_b32_e32 v25, s63
+; VI-NEXT: v_mov_b32_e32 v60, s13
+; VI-NEXT: v_mov_b32_e32 v59, s12
+; VI-NEXT: v_mov_b32_e32 v26, s72
+; VI-NEXT: v_mov_b32_e32 v44, s80
+; VI-NEXT: v_mov_b32_e32 v37, s71
+; VI-NEXT: v_mov_b32_e32 v54, s70
+; VI-NEXT: v_mov_b32_e32 v36, s11
+; VI-NEXT: v_mov_b32_e32 v51, s10
+; VI-NEXT: v_mov_b32_e32 v48, s69
+; VI-NEXT: v_mov_b32_e32 v22, s68
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v20, s67
+; VI-NEXT: v_mov_b32_e32 v21, s66
+; VI-NEXT: v_mov_b32_e32 v18, s9
+; VI-NEXT: v_mov_b32_e32 v39, s65
+; VI-NEXT: v_mov_b32_e32 v50, s64
+; VI-NEXT: v_mov_b32_e32 v49, s8
+; VI-NEXT: v_mov_b32_e32 v14, s87
+; VI-NEXT: v_mov_b32_e32 v53, s86
+; VI-NEXT: v_mov_b32_e32 v45, s7
+; VI-NEXT: v_mov_b32_e32 v43, s85
+; VI-NEXT: v_mov_b32_e32 v58, s6
+; VI-NEXT: v_mov_b32_e32 v56, s84
+; VI-NEXT: v_mov_b32_e32 v1, s78
+; VI-NEXT: v_mov_b32_e32 v3, s90
+; VI-NEXT: v_mov_b32_e32 v6, s36
+; VI-NEXT: v_readlane_b32 s42, v62, 18
+; VI-NEXT: v_readlane_b32 s43, v62, 19
+; VI-NEXT: v_readlane_b32 s76, v62, 20
+; VI-NEXT: v_readlane_b32 s44, v62, 21
+; VI-NEXT: v_readlane_b32 s75, v62, 22
+; VI-NEXT: v_readlane_b32 s45, v62, 23
+; VI-NEXT: v_readlane_b32 s74, v62, 24
+; VI-NEXT: v_readlane_b32 s46, v62, 25
+; VI-NEXT: v_readlane_b32 s40, v62, 26
+; VI-NEXT: v_readlane_b32 s73, v62, 27
+; VI-NEXT: v_readlane_b32 s41, v62, 28
+; VI-NEXT: v_readlane_b32 s72, v62, 29
+; VI-NEXT: v_readlane_b32 s47, v62, 30
+; VI-NEXT: v_readlane_b32 s14, v62, 31
+; VI-NEXT: v_readlane_b32 s63, v62, 32
+; VI-NEXT: v_readlane_b32 s62, v62, 33
+; VI-NEXT: v_readlane_b32 s15, v62, 34
+; VI-NEXT: v_readlane_b32 s61, v62, 35
+; VI-NEXT: v_readlane_b32 s56, v62, 36
+; VI-NEXT: v_readlane_b32 s12, v62, 37
+; VI-NEXT: v_readlane_b32 s60, v62, 38
+; VI-NEXT: v_readlane_b32 s13, v62, 39
+; VI-NEXT: v_readlane_b32 s59, v62, 40
+; VI-NEXT: v_readlane_b32 s57, v62, 41
+; VI-NEXT: v_readlane_b32 s10, v62, 42
+; VI-NEXT: v_readlane_b32 s58, v62, 43
+; VI-NEXT: v_readlane_b32 s11, v62, 44
+; VI-NEXT: v_readlane_b32 s77, v62, 45
+; VI-NEXT: v_readlane_b32 s89, v62, 46
+; VI-NEXT: v_readlane_b32 s90, v62, 47
+; VI-NEXT: v_readlane_b32 s91, v62, 48
+; VI-NEXT: v_readlane_b32 vcc_lo, v62, 49
+; VI-NEXT: v_readlane_b32 s9, v62, 50
+; VI-NEXT: v_readlane_b32 vcc_hi, v62, 51
+; VI-NEXT: v_readlane_b32 s31, v62, 52
+; VI-NEXT: v_readlane_b32 s35, v62, 53
+; VI-NEXT: v_readlane_b32 s36, v62, 54
+; VI-NEXT: v_readlane_b32 s39, v62, 9
+; VI-NEXT: v_readlane_b32 s37, v62, 55
+; VI-NEXT: v_readlane_b32 s78, v62, 56
+; VI-NEXT: v_readlane_b32 s8, v62, 57
+; VI-NEXT: v_readlane_b32 s49, v62, 7
+; VI-NEXT: v_readlane_b32 s51, v62, 5
+; VI-NEXT: v_readlane_b32 s53, v62, 3
+; VI-NEXT: v_readlane_b32 s55, v62, 1
; VI-NEXT: .LBB91_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_and_b32 s6, s16, 0xff
+; VI-NEXT: s_lshl_b32 s7, vcc_hi, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s8, 0xff
+; VI-NEXT: s_lshl_b32 s8, s54, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: v_mov_b32_e32 v23, s6
+; VI-NEXT: s_and_b32 s6, s17, 0xff
+; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s78, 0xff
+; VI-NEXT: s_lshl_b32 s8, s37, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s18, 0xff
+; VI-NEXT: s_lshl_b32 s7, s90, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s36, 0xff
+; VI-NEXT: s_lshl_b32 s8, s52, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v23, vcc, 4, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s19, 0xff
+; VI-NEXT: s_lshl_b32 s7, s77, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s35, 0xff
+; VI-NEXT: s_lshl_b32 s8, s31, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 8, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s20, 0xff
+; VI-NEXT: s_lshl_b32 s7, s58, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s9, 0xff
+; VI-NEXT: s_lshl_b32 s8, s50, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 12, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s21, 0xff
+; VI-NEXT: s_lshl_b32 s7, s59, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s91, 0xff
+; VI-NEXT: s_lshl_b32 s8, s89, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 16, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s22, 0xff
+; VI-NEXT: s_lshl_b32 s7, s60, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s11, 0xff
+; VI-NEXT: s_lshl_b32 s8, s48, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 20, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s23, 0xff
+; VI-NEXT: s_lshl_b32 s7, s61, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s10, 0xff
+; VI-NEXT: s_lshl_b32 s8, s57, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 24, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s24, 0xff
+; VI-NEXT: s_lshl_b32 s7, s62, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s13, 0xff
+; VI-NEXT: s_lshl_b32 s8, s38, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 28, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s25, 0xff
+; VI-NEXT: s_lshl_b32 s7, s63, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s12, 0xff
+; VI-NEXT: s_lshl_b32 s8, s56, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 32, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s26, 0xff
+; VI-NEXT: s_lshl_b32 s7, s72, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s15, 0xff
+; VI-NEXT: s_lshl_b32 s8, s34, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 36, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s27, 0xff
+; VI-NEXT: s_lshl_b32 s7, s73, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s14, 0xff
+; VI-NEXT: s_lshl_b32 s8, s47, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s28, 0xff
+; VI-NEXT: s_lshl_b32 s7, s74, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s41, 0xff
+; VI-NEXT: s_lshl_b32 s8, s30, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 44, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s6, s29, 0xff
+; VI-NEXT: s_lshl_b32 s7, s75, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s7, s40, 0xff
+; VI-NEXT: s_lshl_b32 s8, s46, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s6, s6, 0xffff
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 48, v0
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xff
+; VI-NEXT: s_lshl_b32 s6, s76, 8
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: s_and_b32 s6, s45, 0xff
+; VI-NEXT: s_lshl_b32 s7, s88, 8
+; VI-NEXT: s_or_b32 s6, s6, s7
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 52, v0
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v24, s4
+; VI-NEXT: s_and_b32 s4, s5, 0xff
+; VI-NEXT: s_lshl_b32 s5, s42, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s44, 0xff
+; VI-NEXT: s_lshl_b32 s6, s43, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v23, vcc, 56, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v23, vcc, 60, v0
+; VI-NEXT: v_mov_b32_e32 v24, s4
+; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v58
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; VI-NEXT: v_or_b32_sdwa v7, v22, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v22, vcc, 64, v0
+; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20
+; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; VI-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-NEXT: v_or_b32_sdwa v6, v44, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; VI-NEXT: v_or_b32_sdwa v5, v61, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_readlane_b32 s87, v63, 31
; VI-NEXT: v_readlane_b32 s86, v63, 30
; VI-NEXT: v_readlane_b32 s85, v63, 29
@@ -165782,374 +167207,105 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_readlane_b32 s31, v63, 1
; VI-NEXT: v_readlane_b32 s30, v63, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0
-; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v56
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v20, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v7, v20, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v10, vcc, 0x48, v0
+; VI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v43
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v51
+; VI-NEXT: v_or_b32_sdwa v10, v48, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v10, vcc, 0x4c, v0
+; VI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v55
+; VI-NEXT: v_or_b32_sdwa v7, v26, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x50, v0
+; VI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v53
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v37
+; VI-NEXT: v_or_b32_sdwa v7, v54, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0x54, v0
+; VI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v15
+; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x58, v0
+; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v14
+; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v60
+; VI-NEXT: v_or_b32_sdwa v5, v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v59, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v50
+; VI-NEXT: v_or_b32_sdwa v5, v30, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v49
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v46
+; VI-NEXT: v_or_b32_sdwa v4, v27, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v47, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v39
+; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v38
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v40
+; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v16
+; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -166166,8 +167322,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; VI-NEXT: s_mov_b64 exec, s[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -216125,74 +217281,74 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v16, v52, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32
-; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_sdwa v15, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14
+; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v13, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12
+; VI-NEXT: v_or_b32_sdwa v12, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
+; VI-NEXT: v_or_b32_sdwa v11, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; VI-NEXT: v_or_b32_sdwa v10, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
+; VI-NEXT: v_or_b32_sdwa v9, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
+; VI-NEXT: v_or_b32_sdwa v8, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; VI-NEXT: v_or_b32_sdwa v7, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; VI-NEXT: v_or_b32_sdwa v6, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v5, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; VI-NEXT: v_or_b32_sdwa v4, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v3, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; VI-NEXT: v_or_b32_sdwa v2, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; VI-NEXT: v_or_b32_sdwa v31, v55, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; VI-NEXT: v_or_b32_sdwa v30, v54, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29
+; VI-NEXT: v_or_b32_sdwa v29, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28
+; VI-NEXT: v_or_b32_sdwa v28, v52, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27
+; VI-NEXT: v_or_b32_sdwa v27, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26
+; VI-NEXT: v_or_b32_sdwa v26, v50, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v25
+; VI-NEXT: v_or_b32_sdwa v25, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; VI-NEXT: v_or_b32_sdwa v24, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23
+; VI-NEXT: v_or_b32_sdwa v23, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v22
+; VI-NEXT: v_or_b32_sdwa v22, v38, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21
+; VI-NEXT: v_or_b32_sdwa v21, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; VI-NEXT: v_or_b32_sdwa v20, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v19
+; VI-NEXT: v_or_b32_sdwa v19, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v18, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17
+; VI-NEXT: v_or_b32_sdwa v17, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16
+; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB100_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -219070,19 +220226,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v14, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -219094,9 +220250,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v13, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219112,9 +220268,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v12, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219130,9 +220286,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v11, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219148,9 +220304,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219166,9 +220322,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219184,9 +220340,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v8, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219202,9 +220358,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219220,9 +220376,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219238,9 +220394,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219256,9 +220412,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219274,9 +220430,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -219292,9 +220448,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
@@ -219310,9 +220466,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v18, s4, v0
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
@@ -219328,8 +220484,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
@@ -219586,38 +220742,38 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16
-; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16
-; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16
-; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16
-; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16
-; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16
-; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16
-; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16
-; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16
-; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16
-; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16
-; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16
-; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v30, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v29, v53, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v28, v52, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v27, v51, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v26, v50, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v25, v49, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v24, v48, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v39, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v19, v35, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v32, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB101_5
; VI-NEXT: .LBB101_3:
; VI-NEXT: s_branch .LBB101_2
@@ -221085,22 +222241,141 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
@@ -221119,21 +222394,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
@@ -221147,21 +222407,37 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
; SI-NEXT: v_cvt_f16_f32_e32 v40, v40
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v45, v45
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v56, v1
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -221180,407 +222456,286 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v1
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v2
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB102_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr28
; SI-NEXT: ; implicit-def: $vgpr29
; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr40
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: .LBB102_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB102_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v2, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v63
; SI-NEXT: v_cvt_f32_f16_e32 v1, v62
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63
; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v61
@@ -221646,35 +222801,30 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v58, v29
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v28
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32
; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33
; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58
; SI-NEXT: v_cvt_f32_f16_e32 v58, v25
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v24
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v26
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28
-; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
@@ -221701,59 +222851,62 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v26
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v28
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
; SI-NEXT: v_cvt_f16_f32_e32 v4, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v9
; SI-NEXT: v_cvt_f16_f32_e32 v4, v10
; SI-NEXT: v_cvt_f16_f32_e32 v5, v11
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v12
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
@@ -221766,10 +222919,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
@@ -221782,10 +222935,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
@@ -221793,23 +222946,25 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
; SI-NEXT: v_cvt_f16_f32_e32 v4, v22
; SI-NEXT: v_cvt_f16_f32_e32 v5, v23
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v27
; SI-NEXT: v_cvt_f16_f32_e32 v4, v26
; SI-NEXT: v_cvt_f16_f32_e32 v5, v25
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
@@ -221822,10 +222977,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
@@ -221838,10 +222993,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
@@ -221854,10 +223009,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
@@ -221870,10 +223025,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
@@ -221886,10 +223041,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
@@ -221902,10 +223057,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
@@ -221918,10 +223073,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
@@ -221933,388 +223088,414 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v58, v58
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v63
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
; SI-NEXT: v_cvt_f16_f32_e32 v2, v62
-; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
; SI-NEXT: v_cvt_f16_f32_e32 v3, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v2, v58
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v3, v24
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_mov_b32_e32 v4, v7
; SI-NEXT: .LBB102_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
@@ -222594,629 +223775,499 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
-; SI-NEXT: v_mov_b32_e32 v46, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v2
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v2
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v45, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v47, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v57, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v8, s16
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v7, s16
; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
-; SI-NEXT: v_cvt_f16_f32_e32 v25, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v29, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v26, s19
+; SI-NEXT: v_cvt_f16_f32_e32 v28, s20
; SI-NEXT: v_cvt_f16_f32_e32 v30, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v27, s24
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT: v_cvt_f16_f32_e32 v59, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v38
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v45, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v46, v49
+; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v48
-; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v57, v57
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v59, v59
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v60
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v63, v63
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v36
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v39
; SI-NEXT: v_cvt_f16_f32_e32 v23, s17
-; SI-NEXT: v_cvt_f16_f32_e32 v38, s22
-; SI-NEXT: v_cvt_f16_f32_e32 v37, s23
-; SI-NEXT: v_cvt_f16_f32_e32 v48, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v49, s26
-; SI-NEXT: v_cvt_f16_f32_e32 v35, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
-; SI-NEXT: v_cvt_f16_f32_e32 v51, s29
+; SI-NEXT: v_cvt_f16_f32_e32 v32, s22
+; SI-NEXT: v_cvt_f16_f32_e32 v33, s23
+; SI-NEXT: v_cvt_f16_f32_e32 v34, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v35, s25
+; SI-NEXT: v_cvt_f16_f32_e32 v36, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v39, s27
+; SI-NEXT: v_cvt_f16_f32_e32 v49, s28
+; SI-NEXT: v_cvt_f16_f32_e32 v52, s29
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_cbranch_scc0 .LBB103_2
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: s_cbranch_scc0 .LBB103_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43
-; SI-NEXT: v_mov_b32_e32 v43, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_mov_b32_e32 v50, v19
-; SI-NEXT: v_mov_b32_e32 v51, v22
-; SI-NEXT: v_mov_b32_e32 v38, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_mov_b32_e32 v37, v45
-; SI-NEXT: v_mov_b32_e32 v27, v26
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26
-; SI-NEXT: v_mov_b32_e32 v49, v47
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v41
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; SI-NEXT: v_mov_b32_e32 v32, v19
+; SI-NEXT: v_mov_b32_e32 v33, v15
+; SI-NEXT: v_mov_b32_e32 v35, v21
+; SI-NEXT: v_mov_b32_e32 v28, v22
+; SI-NEXT: v_mov_b32_e32 v39, v54
+; SI-NEXT: v_mov_b32_e32 v36, v43
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43
+; SI-NEXT: v_mov_b32_e32 v43, v7
+; SI-NEXT: v_mov_b32_e32 v49, v44
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v46
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v58
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25
; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_mov_b32_e32 v57, v5
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; SI-NEXT: v_mov_b32_e32 v32, v7
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
-; SI-NEXT: v_mov_b32_e32 v33, v12
-; SI-NEXT: v_mov_b32_e32 v34, v5
-; SI-NEXT: v_mov_b32_e32 v58, v7
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44
-; SI-NEXT: v_mov_b32_e32 v44, v18
-; SI-NEXT: v_mov_b32_e32 v5, v43
-; SI-NEXT: v_mov_b32_e32 v18, v6
-; SI-NEXT: s_branch .LBB103_3
-; SI-NEXT: .LBB103_2:
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: v_mov_b32_e32 v35, v28
-; SI-NEXT: v_mov_b32_e32 v49, v47
-; SI-NEXT: v_mov_b32_e32 v27, v26
-; SI-NEXT: v_mov_b32_e32 v37, v45
-; SI-NEXT: v_mov_b32_e32 v38, v16
-; SI-NEXT: v_mov_b32_e32 v51, v22
-; SI-NEXT: v_mov_b32_e32 v50, v19
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: v_mov_b32_e32 v5, v6
-; SI-NEXT: ; kill: killed $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: .LBB103_3: ; %Flow
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v36, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v43, v9
-; SI-NEXT: v_mov_b32_e32 v12, v31
-; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v31, v11
-; SI-NEXT: v_mov_b32_e32 v9, v17
-; SI-NEXT: s_cbranch_vccnz .LBB103_5
-; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60
+; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v62
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2
+; SI-NEXT: v_mov_b32_e32 v44, v52
+; SI-NEXT: s_cbranch_execnz .LBB103_3
+; SI-NEXT: .LBB103_2: ; %cmp.true
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v36
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v63
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v62
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v61
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v55
+; SI-NEXT: v_cvt_f32_f16_e32 v48, v33
+; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v25
+; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v32
+; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v35
+; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v48
+; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v59
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v58
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v55
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v42
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v41
-; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15
-; SI-NEXT: v_mov_b32_e32 v6, v37
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v39
-; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v52
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v53
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v45
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v56
+; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v47
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v46
+; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v40
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v34, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v39
; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v51
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v60
-; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v50
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v13
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v18
-; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
-; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v20, v49
-; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v49
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v41
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v44
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v34
+; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v36, v20
+; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v56
-; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v27
-; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v35
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v38
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v46
-; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v32, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v34, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v36, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40
+; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
+; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v58, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
+; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v59, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
+; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v57, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v61, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v58, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58
+; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v26, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v12, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v59, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59
-; SI-NEXT: v_cvt_f16_f32_e32 v59, v59
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v60, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v61, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v62, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v63, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63
; SI-NEXT: v_cvt_f16_f32_e32 v63, v63
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -223232,482 +224283,531 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v60
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v35
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v49
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v19
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v53
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v60
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v57
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v58
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v56
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v47
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v46
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v46
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v45
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v43
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v43
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v42
; SI-NEXT: v_cvt_f16_f32_e32 v12, v41
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v40
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v55
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v51
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v34
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v31
-; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v8
+; SI-NEXT: v_mov_b32_e32 v51, v20
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v6
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
; SI-NEXT: v_cvt_f16_f32_e32 v4, v16
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v20
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v17
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v39
-; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v52
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v38
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v9
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v34
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v24
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v30
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v22
+; SI-NEXT: v_mov_b32_e32 v16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v23
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v15
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v48
-; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v26
+; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v24
-; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v14
-; SI-NEXT: v_mov_b32_e32 v16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v37
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v25
-; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v55
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v30
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v54
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; SI-NEXT: v_mov_b32_e32 v6, v19
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v19, v15
+; SI-NEXT: v_mov_b32_e32 v15, v20
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; SI-NEXT: v_mov_b32_e32 v4, v27
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; SI-NEXT: v_mov_b32_e32 v4, v25
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3
; SI-NEXT: v_mov_b32_e32 v3, v13
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; SI-NEXT: .LBB103_5: ; %end
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2
+; SI-NEXT: .LBB103_3: ; %end
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -223728,6 +224828,116 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
+; SI-NEXT: .LBB103_4:
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v49, v44
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: v_mov_b32_e32 v36, v43
+; SI-NEXT: v_mov_b32_e32 v39, v54
+; SI-NEXT: v_mov_b32_e32 v28, v22
+; SI-NEXT: v_mov_b32_e32 v35, v21
+; SI-NEXT: v_mov_b32_e32 v33, v15
+; SI-NEXT: v_mov_b32_e32 v32, v19
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; kill: killed $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; kill: killed $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: s_branch .LBB103_2
;
; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar:
; VI: ; %bb.0:
@@ -224058,313 +225268,537 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v28
+; SI-NEXT: ; kill: killed $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; kill: killed $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v33
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; kill: killed $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63
-; SI-NEXT: ; kill: killed $vgpr45
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v47
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v43
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v38
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v45
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46
+; SI-NEXT: v_mul_f32_e32 v7, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v40
+; SI-NEXT: ; kill: killed $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; kill: killed $vgpr38
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; kill: killed $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; kill: killed $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v2
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; kill: killed $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB104_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; kill: killed $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v56
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v58
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v59
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v27
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; kill: killed $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; kill: killed $vgpr22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; kill: killed $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; kill: killed $vgpr26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; kill: killed $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -224376,834 +225810,677 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: .LBB104_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB104_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; SI-NEXT: v_alignbit_b32 v15, v22, v15, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
-; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v3, v3, v10
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15
-; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v39, v10, v11
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16
+; SI-NEXT: v_or_b32_e32 v29, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_or_b32_e32 v30, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v34, v10, v11
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v62, v10, v11
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v61
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v37, v10, v11
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v59
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v58
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v56
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_or_b32_e32 v13, v8, v9
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_alignbit_b32 v4, v10, v4, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v14, v8, v9
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v15, v8, v15
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v46, v15, v27, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v16, v8, v16
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v45, v16, v26, 16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v44, v14, v31, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v26
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v27
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v42, v12, v33, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v31
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v41, v11, v38, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v32
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v55, v10, v48, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v33
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
-; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
-; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v54, v37, v49, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v38
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v53, v62, v50, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v48
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v52, v34, v51, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
-; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v49
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v50
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v50, v29, v43, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v51
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v51, v30, v40, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16
-; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v40
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v43
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v25
+; SI-NEXT: v_alignbit_b32 v49, v39, v47, 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v47
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31
-; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v56
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v48, v43, v57, 16
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_alignbit_b32 v2, v47, v56, 16
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v57
; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: .LBB104_4: ; %end
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; SI-NEXT: v_alignbit_b32 v3, v13, v32, 16
+; SI-NEXT: v_alignbit_b32 v38, v40, v58, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v58
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17
+; SI-NEXT: v_mov_b32_e32 v17, v39
+; SI-NEXT: v_mov_b32_e32 v39, v62
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT: .LBB104_4: ; %end
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v45
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -225771,74 +227048,74 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31
-; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29
-; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28
-; VI-NEXT: v_alignbit_b32 v28, v16, v52, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27
-; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26
-; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25
-; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24
-; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23
-; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22
-; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21
-; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19
-; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32
-; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_sdwa v15, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14
+; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v13, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12
+; VI-NEXT: v_or_b32_sdwa v12, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
+; VI-NEXT: v_or_b32_sdwa v11, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; VI-NEXT: v_or_b32_sdwa v10, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
+; VI-NEXT: v_or_b32_sdwa v9, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
+; VI-NEXT: v_or_b32_sdwa v8, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; VI-NEXT: v_or_b32_sdwa v7, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; VI-NEXT: v_or_b32_sdwa v6, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v5, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; VI-NEXT: v_or_b32_sdwa v4, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v3, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; VI-NEXT: v_or_b32_sdwa v2, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; VI-NEXT: v_or_b32_sdwa v31, v55, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; VI-NEXT: v_or_b32_sdwa v30, v54, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29
+; VI-NEXT: v_or_b32_sdwa v29, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28
+; VI-NEXT: v_or_b32_sdwa v28, v52, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27
+; VI-NEXT: v_or_b32_sdwa v27, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26
+; VI-NEXT: v_or_b32_sdwa v26, v50, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v25
+; VI-NEXT: v_or_b32_sdwa v25, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; VI-NEXT: v_or_b32_sdwa v24, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23
+; VI-NEXT: v_or_b32_sdwa v23, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v22
+; VI-NEXT: v_or_b32_sdwa v22, v38, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21
+; VI-NEXT: v_or_b32_sdwa v21, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; VI-NEXT: v_or_b32_sdwa v20, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v19
+; VI-NEXT: v_or_b32_sdwa v19, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; VI-NEXT: v_or_b32_sdwa v18, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32
+; VI-NEXT: v_or_b32_sdwa v1, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17
+; VI-NEXT: v_or_b32_sdwa v17, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16
+; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB104_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -227562,284 +228839,308 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v7, 1.0, v30
+; SI-NEXT: v_mul_f32_e64 v26, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s24
; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28
-; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s29
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v32
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v33
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v38
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50
-; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53
-; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v55
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v54
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v40
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v41
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v43
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v32, 1.0, s20
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB105_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v57
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v35
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v36
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v36, v59
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v59
+; SI-NEXT: v_mov_b32_e32 v42, v50
+; SI-NEXT: v_mov_b32_e32 v50, v11
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v48
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62
+; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v38, v60
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v60
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v5
-; SI-NEXT: v_mov_b32_e32 v42, v62
-; SI-NEXT: v_mov_b32_e32 v43, v63
-; SI-NEXT: v_mov_b32_e32 v55, v12
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25
-; SI-NEXT: v_mov_b32_e32 v25, v60
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v40, v20
-; SI-NEXT: v_mov_b32_e32 v51, v61
+; SI-NEXT: v_mov_b32_e32 v28, v31
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31
+; SI-NEXT: v_mov_b32_e32 v41, v24
+; SI-NEXT: v_mov_b32_e32 v43, v25
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_mov_b32_e32 v52, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46
-; SI-NEXT: v_mov_b32_e32 v29, v31
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31
-; SI-NEXT: v_mov_b32_e32 v24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_mov_b32_e32 v52, v10
-; SI-NEXT: v_mov_b32_e32 v53, v59
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_mov_b32_e32 v53, v42
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v40
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20
+; SI-NEXT: v_mov_b32_e32 v20, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v63
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v50, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v35, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62
-; SI-NEXT: v_mov_b32_e32 v62, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63
-; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v46
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v31, v3
+; SI-NEXT: v_mov_b32_e32 v3, v54
+; SI-NEXT: v_mov_b32_e32 v54, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v39
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mov_b32_e32 v41, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33
-; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v56
+; SI-NEXT: v_mov_b32_e32 v56, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mov_b32_e32 v48, v11
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v45
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
-; SI-NEXT: v_mov_b32_e32 v39, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v44
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25
+; SI-NEXT: v_mov_b32_e32 v25, v29
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47
+; SI-NEXT: v_mov_b32_e32 v47, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v42
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v45
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v37, v38
-; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT: v_mov_b32_e32 v11, v45
+; SI-NEXT: v_mov_b32_e32 v45, v46
+; SI-NEXT: v_mov_b32_e32 v46, v62
+; SI-NEXT: v_mov_b32_e32 v62, v58
+; SI-NEXT: v_mov_b32_e32 v58, v26
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24
+; SI-NEXT: v_mov_b32_e32 v7, v24
+; SI-NEXT: v_mov_b32_e32 v24, v29
+; SI-NEXT: v_mov_b32_e32 v29, v15
+; SI-NEXT: v_mov_b32_e32 v15, v44
+; SI-NEXT: v_mov_b32_e32 v44, v63
; SI-NEXT: s_branch .LBB105_3
; SI-NEXT: .LBB105_2:
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v55, v12
-; SI-NEXT: v_mov_b32_e32 v33, v34
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: v_mov_b32_e32 v53, v50
+; SI-NEXT: v_mov_b32_e32 v50, v11
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: ; kill: killed $vgpr7
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -227871,674 +229172,716 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: v_mov_b32_e32 v51, v61
-; SI-NEXT: v_mov_b32_e32 v42, v62
-; SI-NEXT: v_mov_b32_e32 v29, v31
-; SI-NEXT: v_mov_b32_e32 v25, v60
-; SI-NEXT: v_mov_b32_e32 v24, v56
-; SI-NEXT: v_mov_b32_e32 v54, v47
-; SI-NEXT: v_mov_b32_e32 v40, v20
-; SI-NEXT: v_mov_b32_e32 v43, v63
-; SI-NEXT: v_mov_b32_e32 v52, v10
-; SI-NEXT: v_mov_b32_e32 v53, v59
-; SI-NEXT: v_mov_b32_e32 v39, v4
-; SI-NEXT: v_mov_b32_e32 v37, v38
+; SI-NEXT: v_mov_b32_e32 v43, v25
+; SI-NEXT: v_mov_b32_e32 v41, v24
+; SI-NEXT: v_mov_b32_e32 v28, v31
+; SI-NEXT: v_mov_b32_e32 v54, v15
+; SI-NEXT: v_mov_b32_e32 v52, v13
+; SI-NEXT: v_mov_b32_e32 v38, v60
+; SI-NEXT: v_mov_b32_e32 v36, v59
+; SI-NEXT: v_mov_b32_e32 v20, v3
; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; kill: killed $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr61
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: .LBB105_3: ; %Flow
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v42, v53
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: s_cbranch_vccnz .LBB105_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v17, v43
+; SI-NEXT: v_mov_b32_e32 v16, v41
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
+; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v40
+; SI-NEXT: v_or_b32_e32 v1, v3, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11
-; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v40
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42
+; SI-NEXT: v_or_b32_e32 v1, v3, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
-; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
-; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v1, v3, v6
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20
-; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33
-; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v1, v3, v7
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v41, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v22, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v23, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v45, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v28, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v25, v3, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v47, v3, v7
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13
+; SI-NEXT: v_mov_b32_e32 v13, v45
+; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; SI-NEXT: v_mov_b32_e32 v60, v23
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v56, v3, v7
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v4, v3, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v7, v7, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v26, v11, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v32
+; SI-NEXT: v_or_b32_e32 v1, v16, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32
-; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v32
+; SI-NEXT: v_or_b32_e32 v1, v16, v63
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16
-; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v2, v32, v1
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v3, v33, v2
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16
+; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v3
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24
-; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16
-; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v5, v35, v4
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v57, v47, v4, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v6, v36, v5
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v59, v25, v5, 16
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; SI-NEXT: v_or_b32_e32 v11, v37, v6
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v37, v7
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29
-; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v39
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v11, v38, v17
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9
-; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v11, v39, v16
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v42
+; SI-NEXT: v_alignbit_b32 v9, v23, v16, 16
+; SI-NEXT: v_mov_b32_e32 v23, v22
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v11, v48, v18
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v50
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v11, v49, v32
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15
-; SI-NEXT: v_mov_b32_e32 v15, v24
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v51
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v11, v50, v33
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16
-; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v11, v51, v19
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
+; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v54
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16
-; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v11, v52, v35
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v36
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16
-; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v11, v54, v20
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44
+; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v44
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16
-; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16
+; SI-NEXT: v_or_b32_e32 v11, v55, v36
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v20, v52
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v30
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16
-; SI-NEXT: v_mov_b32_e32 v14, v51
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: .LBB105_5: ; %end
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v29
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v14
+; SI-NEXT: v_mov_b32_e32 v31, v28
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53
+; SI-NEXT: v_alignbit_b32 v53, v26, v34, 16
+; SI-NEXT: v_alignbit_b32 v34, v3, v1, 16
+; SI-NEXT: v_alignbit_b32 v12, v31, v6, 16
+; SI-NEXT: v_alignbit_b32 v8, v22, v18, 16
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v6, v41, v32, 16
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10
+; SI-NEXT: v_alignbit_b32 v14, v7, v63, 16
+; SI-NEXT: v_alignbit_b32 v30, v56, v2, 16
+; SI-NEXT: v_alignbit_b32 v10, v13, v17, 16
+; SI-NEXT: v_mov_b32_e32 v22, v41
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_alignbit_b32 v51, v18, v36, 16
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v32, v35, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v21, v1
+; SI-NEXT: v_alignbit_b32 v5, v1, v33, 16
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v4, v1, v19, 16
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v33, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; SI-NEXT: v_or_b32_e32 v7, v7, v8
-; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; SI-NEXT: v_alignbit_b32 v1, v19, v20, 16
+; SI-NEXT: .LBB105_5: ; %end
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_or_b32_e32 v4, v7, v4
-; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; SI-NEXT: v_or_b32_e32 v11, v11, v16
+; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v11, v11, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v7, v7, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; SI-NEXT: v_or_b32_e32 v4, v4, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v23
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; SI-NEXT: v_or_b32_e32 v3, v3, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
+; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v6
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0
+; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v29
+; SI-NEXT: v_or_b32_e32 v3, v3, v6
+; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
+; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v33
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v32
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -228624,19 +229967,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v14, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -228648,9 +229991,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v13, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228666,9 +230009,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v12, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228684,9 +230027,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v11, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228702,9 +230045,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228720,9 +230063,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228738,9 +230081,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v8, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228756,9 +230099,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228774,9 +230117,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228792,9 +230135,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228810,9 +230153,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228828,9 +230171,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
@@ -228846,9 +230189,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
@@ -228864,9 +230207,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16
+; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v18, s4, v0
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
@@ -228882,8 +230225,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16
; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
@@ -229140,38 +230483,38 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31
; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16
-; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16
-; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16
-; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16
-; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16
-; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16
-; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16
-; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16
-; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16
-; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16
-; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16
-; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16
-; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16
-; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16
-; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v30, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v29, v53, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v28, v52, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v27, v51, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v26, v50, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v25, v49, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v24, v48, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v39, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v19, v35, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v32, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB105_5
; VI-NEXT: .LBB105_3:
; VI-NEXT: s_branch .LBB105_2
@@ -230643,8 +231986,8 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55
; SI-NEXT: ; kill: killed $vgpr3
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr27
; SI-NEXT: ; implicit-def: $vgpr25
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr21
@@ -230706,10 +232049,10 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3
@@ -231057,7 +232400,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: v_or_b32_e32 v9, v20, v9
; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9
-; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6
; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -231066,7 +232409,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10
@@ -231078,262 +232421,294 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) {
; SI-NEXT: .LBB106_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
@@ -231637,54 +233012,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_writelane_b32 v40, s85, 29
; SI-NEXT: v_writelane_b32 v40, s86, 30
; SI-NEXT: v_writelane_b32 v40, s87, 31
+; SI-NEXT: s_mov_b32 s74, s23
+; SI-NEXT: s_mov_b32 s72, s21
+; SI-NEXT: s_mov_b32 s61, s18
; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
; SI-NEXT: s_mov_b32 s60, s16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s17, 0
-; SI-NEXT: s_mov_b32 s61, s19
; SI-NEXT: v_writelane_b32 v41, s60, 1
-; SI-NEXT: s_mov_b32 s63, s18
-; SI-NEXT: v_writelane_b32 v41, s61, 2
-; SI-NEXT: s_mov_b32 s72, s21
-; SI-NEXT: v_writelane_b32 v41, s63, 3
+; SI-NEXT: v_writelane_b32 v41, s19, 2
+; SI-NEXT: v_writelane_b32 v41, s61, 3
; SI-NEXT: v_writelane_b32 v41, s72, 4
-; SI-NEXT: s_mov_b32 s74, s23
; SI-NEXT: v_writelane_b32 v41, s20, 5
; SI-NEXT: v_writelane_b32 v41, s74, 6
-; SI-NEXT: s_mov_b32 s75, s25
+; SI-NEXT: s_mov_b32 s76, s25
; SI-NEXT: v_writelane_b32 v41, s22, 7
-; SI-NEXT: v_writelane_b32 v41, s75, 8
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: v_writelane_b32 v41, s76, 8
+; SI-NEXT: s_mov_b32 s78, s27
; SI-NEXT: v_writelane_b32 v41, s24, 9
-; SI-NEXT: v_writelane_b32 v41, s76, 10
-; SI-NEXT: s_mov_b32 s93, s29
+; SI-NEXT: v_writelane_b32 v41, s78, 10
+; SI-NEXT: s_mov_b32 s88, s29
; SI-NEXT: v_writelane_b32 v41, s26, 11
-; SI-NEXT: v_writelane_b32 v41, s93, 12
-; SI-NEXT: v_readfirstlane_b32 s16, v2
+; SI-NEXT: v_writelane_b32 v41, s88, 12
+; SI-NEXT: v_readfirstlane_b32 s77, v2
; SI-NEXT: v_writelane_b32 v41, s28, 13
-; SI-NEXT: v_readfirstlane_b32 s73, v4
-; SI-NEXT: v_writelane_b32 v41, s16, 14
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_writelane_b32 v41, s73, 15
-; SI-NEXT: v_readfirstlane_b32 s90, v6
-; SI-NEXT: v_writelane_b32 v41, s89, 16
-; SI-NEXT: v_readfirstlane_b32 s91, v5
-; SI-NEXT: v_writelane_b32 v41, s90, 17
-; SI-NEXT: v_readfirstlane_b32 s34, v8
-; SI-NEXT: v_writelane_b32 v41, s91, 18
-; SI-NEXT: v_readfirstlane_b32 s35, v7
-; SI-NEXT: v_writelane_b32 v41, s34, 19
-; SI-NEXT: v_readfirstlane_b32 s36, v10
-; SI-NEXT: v_writelane_b32 v41, s35, 20
-; SI-NEXT: v_writelane_b32 v40, s96, 32
-; SI-NEXT: v_readfirstlane_b32 s37, v9
-; SI-NEXT: v_writelane_b32 v41, s36, 21
+; SI-NEXT: v_readfirstlane_b32 s79, v4
+; SI-NEXT: v_writelane_b32 v41, s77, 14
+; SI-NEXT: v_readfirstlane_b32 s90, v3
+; SI-NEXT: v_writelane_b32 v41, s79, 15
+; SI-NEXT: v_readfirstlane_b32 s91, v6
+; SI-NEXT: v_writelane_b32 v41, s90, 16
+; SI-NEXT: v_readfirstlane_b32 s92, v5
+; SI-NEXT: v_writelane_b32 v41, s91, 17
+; SI-NEXT: v_readfirstlane_b32 s93, v8
+; SI-NEXT: v_writelane_b32 v41, s92, 18
+; SI-NEXT: v_readfirstlane_b32 s94, v7
+; SI-NEXT: v_writelane_b32 v41, s93, 19
+; SI-NEXT: v_readfirstlane_b32 s95, v10
+; SI-NEXT: v_writelane_b32 v41, s94, 20
+; SI-NEXT: v_readfirstlane_b32 s30, v9
+; SI-NEXT: v_writelane_b32 v41, s95, 21
+; SI-NEXT: v_readfirstlane_b32 s31, v12
+; SI-NEXT: v_writelane_b32 v41, s30, 22
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s62, v31
+; SI-NEXT: v_readfirstlane_b32 s21, v31
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s80, v32
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s69, v33
+; SI-NEXT: v_readfirstlane_b32 s75, v33
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
@@ -231696,20 +233071,25 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s84, v34
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s68, v35
+; SI-NEXT: v_readfirstlane_b32 s23, v35
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v36
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s87, v38
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
-; SI-NEXT: v_readfirstlane_b32 s6, v37
+; SI-NEXT: v_readfirstlane_b32 s18, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: v_writelane_b32 v41, s31, 23
+; SI-NEXT: v_readfirstlane_b32 s34, v11
+; SI-NEXT: v_readfirstlane_b32 s35, v14
+; SI-NEXT: v_readfirstlane_b32 s36, v13
+; SI-NEXT: v_writelane_b32 v40, s96, 32
+; SI-NEXT: v_readfirstlane_b32 s37, v16
; SI-NEXT: v_writelane_b32 v40, s97, 33
-; SI-NEXT: v_readfirstlane_b32 s38, v12
-; SI-NEXT: v_writelane_b32 v41, s37, 22
+; SI-NEXT: v_readfirstlane_b32 s38, v15
; SI-NEXT: v_writelane_b32 v40, s98, 34
; SI-NEXT: v_readfirstlane_b32 s14, v30
; SI-NEXT: v_readfirstlane_b32 s15, v29
@@ -231719,21 +233099,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: v_readfirstlane_b32 s11, v25
; SI-NEXT: v_readfirstlane_b32 s8, v24
; SI-NEXT: v_readfirstlane_b32 s9, v23
-; SI-NEXT: v_readfirstlane_b32 s88, v22
-; SI-NEXT: v_readfirstlane_b32 s29, v21
-; SI-NEXT: v_readfirstlane_b32 s79, v20
-; SI-NEXT: v_readfirstlane_b32 s27, v19
-; SI-NEXT: v_readfirstlane_b32 s78, v18
-; SI-NEXT: v_readfirstlane_b32 s25, v17
-; SI-NEXT: v_readfirstlane_b32 s77, v16
-; SI-NEXT: v_readfirstlane_b32 s23, v15
-; SI-NEXT: v_readfirstlane_b32 s39, v14
-; SI-NEXT: v_readfirstlane_b32 s21, v13
-; SI-NEXT: v_readfirstlane_b32 s19, v11
-; SI-NEXT: v_readfirstlane_b32 s18, v1
-; SI-NEXT: v_writelane_b32 v41, s38, 23
+; SI-NEXT: v_readfirstlane_b32 s89, v22
+; SI-NEXT: v_readfirstlane_b32 s7, v21
+; SI-NEXT: v_readfirstlane_b32 s25, v20
+; SI-NEXT: v_readfirstlane_b32 s29, v19
+; SI-NEXT: v_readfirstlane_b32 s39, v18
+; SI-NEXT: v_readfirstlane_b32 s27, v17
; SI-NEXT: v_writelane_b32 v40, s99, 35
-; SI-NEXT: v_writelane_b32 v41, s39, 24
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s58, v31
; SI-NEXT: s_waitcnt vmcnt(11)
@@ -231753,261 +233125,284 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s42, v34
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_readfirstlane_b32 s5, v1
+; SI-NEXT: v_writelane_b32 v41, s5, 24
+; SI-NEXT: v_writelane_b32 v41, s34, 25
+; SI-NEXT: v_writelane_b32 v41, s35, 26
+; SI-NEXT: v_writelane_b32 v41, s36, 27
+; SI-NEXT: v_writelane_b32 v41, s37, 28
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s43, v35
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s40, v36
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s41, v37
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_writelane_b32 v41, s38, 29
+; SI-NEXT: v_writelane_b32 v41, s39, 30
; SI-NEXT: s_cbranch_scc0 .LBB107_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_lshl_b32 s4, s60, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 25
-; SI-NEXT: s_lshl_b32 s4, s63, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 26
-; SI-NEXT: s_lshl_b32 s4, s20, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 27
-; SI-NEXT: s_lshl_b32 s4, s22, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 28
-; SI-NEXT: s_lshl_b32 s4, s24, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 29
-; SI-NEXT: s_lshl_b32 s4, s26, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 30
-; SI-NEXT: s_lshl_b32 s4, s28, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 31
-; SI-NEXT: s_lshl_b32 s4, s18, 16
; SI-NEXT: v_writelane_b32 v41, s4, 32
-; SI-NEXT: s_lshl_b32 s4, s89, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 33
-; SI-NEXT: s_lshl_b32 s4, s91, 16
+; SI-NEXT: s_lshl_b32 s4, s17, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 31
+; SI-NEXT: s_lshl_b32 s4, s61, 16
; SI-NEXT: v_writelane_b32 v41, s4, 34
-; SI-NEXT: s_lshl_b32 s4, s35, 16
-; SI-NEXT: v_writelane_b32 v41, s4, 35
-; SI-NEXT: s_lshl_b32 s4, s37, 16
-; SI-NEXT: s_lshl_b32 s7, s17, 16
-; SI-NEXT: s_lshl_b32 s96, s61, 16
-; SI-NEXT: s_lshl_b32 s99, s72, 16
-; SI-NEXT: s_lshl_b32 s97, s74, 16
-; SI-NEXT: s_lshl_b32 s92, s75, 16
-; SI-NEXT: s_lshl_b32 s94, s76, 16
-; SI-NEXT: s_lshl_b32 s95, s93, 16
-; SI-NEXT: s_lshl_b32 s93, s16, 16
-; SI-NEXT: s_lshl_b32 s30, s73, 16
-; SI-NEXT: s_lshl_b32 s31, s90, 16
-; SI-NEXT: s_lshl_b32 s34, s34, 16
+; SI-NEXT: s_lshl_b32 s4, s19, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 33
+; SI-NEXT: s_lshl_b32 s4, s20, 16
; SI-NEXT: v_writelane_b32 v41, s4, 36
-; SI-NEXT: s_lshl_b32 s35, s36, 16
-; SI-NEXT: s_lshl_b32 s86, s19, 16
-; SI-NEXT: s_lshl_b32 s36, s38, 16
-; SI-NEXT: s_lshl_b32 s22, s21, 16
-; SI-NEXT: s_lshl_b32 s37, s39, 16
-; SI-NEXT: s_lshl_b32 s24, s23, 16
-; SI-NEXT: s_lshl_b32 s38, s77, 16
-; SI-NEXT: s_lshl_b32 s28, s25, 16
-; SI-NEXT: s_lshl_b32 s39, s78, 16
-; SI-NEXT: s_lshl_b32 s61, s27, 16
-; SI-NEXT: s_lshl_b32 s48, s79, 16
-; SI-NEXT: s_lshl_b32 s89, s29, 16
-; SI-NEXT: s_lshl_b32 s49, s88, 16
-; SI-NEXT: s_lshl_b32 s60, s9, 16
-; SI-NEXT: s_lshl_b32 s50, s8, 16
-; SI-NEXT: s_lshl_b32 s90, s11, 16
-; SI-NEXT: s_lshl_b32 s91, s10, 16
-; SI-NEXT: s_lshl_b32 s70, s13, 16
-; SI-NEXT: s_lshl_b32 s51, s12, 16
-; SI-NEXT: s_lshl_b32 s71, s15, 16
-; SI-NEXT: s_lshl_b32 s52, s14, 16
-; SI-NEXT: s_lshl_b32 s20, s41, 16
-; SI-NEXT: s_lshl_b32 s53, s40, 16
-; SI-NEXT: s_lshl_b32 s81, s43, 16
-; SI-NEXT: s_lshl_b32 s54, s42, 16
-; SI-NEXT: s_lshl_b32 s63, s45, 16
-; SI-NEXT: s_lshl_b32 s55, s44, 16
-; SI-NEXT: s_lshl_b32 s72, s47, 16
-; SI-NEXT: s_lshl_b32 s64, s46, 16
-; SI-NEXT: s_lshl_b32 s82, s57, 16
-; SI-NEXT: s_lshl_b32 s65, s56, 16
-; SI-NEXT: s_lshl_b32 s74, s59, 16
-; SI-NEXT: s_lshl_b32 s66, s58, 16
-; SI-NEXT: s_lshl_b32 s75, s87, 16
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: s_lshl_b32 s67, s6, 16
-; SI-NEXT: s_lshl_b32 s76, s83, 16
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: s_lshl_b32 s68, s68, 16
-; SI-NEXT: s_lshl_b32 s85, s84, 16
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: s_lshl_b32 s69, s69, 16
-; SI-NEXT: s_lshl_b32 s17, s80, 16
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: s_lshl_b32 s26, s62, 16
+; SI-NEXT: s_lshl_b32 s4, s72, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 35
+; SI-NEXT: s_lshl_b32 s4, s74, 16
+; SI-NEXT: s_lshl_b32 s16, s22, 16
+; SI-NEXT: v_writelane_b32 v41, s4, 37
+; SI-NEXT: s_lshl_b32 s6, s24, 16
+; SI-NEXT: s_lshl_b32 s73, s76, 16
+; SI-NEXT: s_lshl_b32 s98, s26, 16
+; SI-NEXT: s_lshl_b32 s63, s78, 16
+; SI-NEXT: s_lshl_b32 s97, s28, 16
+; SI-NEXT: s_lshl_b32 s62, s88, 16
+; SI-NEXT: s_lshl_b32 s85, s5, 16
+; SI-NEXT: s_lshl_b32 s96, s77, 16
+; SI-NEXT: s_lshl_b32 s81, s90, 16
+; SI-NEXT: s_lshl_b32 s99, s79, 16
+; SI-NEXT: s_lshl_b32 s70, s92, 16
+; SI-NEXT: s_lshl_b32 s86, s91, 16
+; SI-NEXT: s_lshl_b32 s68, s94, 16
+; SI-NEXT: s_lshl_b32 s82, s93, 16
+; SI-NEXT: s_lshl_b32 s66, s30, 16
+; SI-NEXT: s_lshl_b32 s71, s95, 16
+; SI-NEXT: s_lshl_b32 s64, s34, 16
+; SI-NEXT: s_lshl_b32 s69, s31, 16
+; SI-NEXT: s_lshl_b32 s54, s36, 16
+; SI-NEXT: s_lshl_b32 s67, s35, 16
+; SI-NEXT: s_lshl_b32 s52, s38, 16
+; SI-NEXT: s_lshl_b32 s65, s37, 16
+; SI-NEXT: s_lshl_b32 s50, s27, 16
+; SI-NEXT: s_lshl_b32 s55, s39, 16
+; SI-NEXT: s_lshl_b32 s49, s29, 16
+; SI-NEXT: s_lshl_b32 s53, s25, 16
+; SI-NEXT: s_lshl_b32 s51, s7, 16
+; SI-NEXT: s_lshl_b32 s39, s89, 16
+; SI-NEXT: s_lshl_b32 s48, s9, 16
+; SI-NEXT: s_lshl_b32 s38, s8, 16
+; SI-NEXT: s_lshl_b32 s37, s11, 16
+; SI-NEXT: s_lshl_b32 s35, s10, 16
+; SI-NEXT: s_lshl_b32 s36, s13, 16
+; SI-NEXT: s_lshl_b32 s31, s12, 16
+; SI-NEXT: s_lshl_b32 s34, s15, 16
+; SI-NEXT: s_lshl_b32 s95, s14, 16
+; SI-NEXT: s_lshl_b32 s30, s41, 16
+; SI-NEXT: s_lshl_b32 s93, s40, 16
+; SI-NEXT: s_lshl_b32 s94, s43, 16
+; SI-NEXT: s_lshl_b32 s91, s42, 16
+; SI-NEXT: s_lshl_b32 s92, s45, 16
+; SI-NEXT: s_lshl_b32 s90, s44, 16
+; SI-NEXT: s_lshl_b32 s28, s47, 16
+; SI-NEXT: s_lshl_b32 s88, s46, 16
+; SI-NEXT: s_lshl_b32 s26, s57, 16
+; SI-NEXT: s_lshl_b32 s78, s56, 16
+; SI-NEXT: s_lshl_b32 s24, s59, 16
+; SI-NEXT: s_lshl_b32 s76, s58, 16
+; SI-NEXT: s_lshl_b32 s22, s87, 16
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: s_lshl_b32 s74, s18, 16
+; SI-NEXT: s_lshl_b32 s20, s83, 16
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: s_lshl_b32 s72, s23, 16
+; SI-NEXT: s_lshl_b32 s19, s84, 16
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: s_lshl_b32 s17, s75, 16
+; SI-NEXT: s_lshl_b32 s61, s80, 16
+; SI-NEXT: s_lshl_b32 s60, s21, 16
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch .LBB107_3
; SI-NEXT: .LBB107_2:
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s16, s68
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s73, s6
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s6, s62
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: s_mov_b32 s98, s69
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s79, s23
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s77, s18
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: s_mov_b32 s18, s75
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr63
+; SI-NEXT: ; implicit-def: $sgpr97
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr85
; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr81
; SI-NEXT: ; implicit-def: $sgpr99
-; SI-NEXT: ; implicit-def: $sgpr97
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr93
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr31
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr70
; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr71
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr67
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr65
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr55
+; SI-NEXT: ; implicit-def: $sgpr49
+; SI-NEXT: ; implicit-def: $sgpr53
+; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr39
-; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr37
+; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr31
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr95
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr93
+; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr71
-; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr81
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr55
; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr64
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr67
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr85
-; SI-NEXT: ; implicit-def: $sgpr69
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
-; SI-NEXT: ; implicit-def: $sgpr17
-; SI-NEXT: ; kill: killed $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr19
; SI-NEXT: ; implicit-def: $sgpr17
+; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: .LBB107_3: ; %Flow
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT: s_mov_b32 s5, s17
-; SI-NEXT: s_mov_b32 s17, s86
-; SI-NEXT: s_mov_b32 s86, s7
+; SI-NEXT: s_mov_b32 s5, s60
+; SI-NEXT: s_mov_b32 s60, s17
+; SI-NEXT: s_mov_b32 s4, s61
+; SI-NEXT: s_mov_b32 s17, s72
+; SI-NEXT: s_mov_b32 s61, s74
+; SI-NEXT: s_mov_b32 s72, s76
+; SI-NEXT: s_mov_b32 s74, s78
+; SI-NEXT: s_mov_b32 s76, s88
+; SI-NEXT: s_mov_b32 s78, s90
+; SI-NEXT: s_mov_b32 s88, s91
+; SI-NEXT: s_mov_b32 s90, s92
+; SI-NEXT: s_mov_b32 s91, s93
+; SI-NEXT: s_mov_b32 s92, s94
+; SI-NEXT: s_mov_b32 s93, s95
+; SI-NEXT: s_mov_b32 s94, s30
+; SI-NEXT: s_mov_b32 s95, s31
+; SI-NEXT: s_mov_b32 s30, s34
+; SI-NEXT: s_mov_b32 s31, s35
+; SI-NEXT: s_mov_b32 s34, s36
+; SI-NEXT: s_mov_b32 s35, s37
+; SI-NEXT: s_mov_b32 s36, s48
+; SI-NEXT: s_mov_b32 s37, s51
+; SI-NEXT: s_mov_b32 s48, s53
+; SI-NEXT: s_mov_b32 s51, s6
+; SI-NEXT: s_mov_b32 s53, s16
; SI-NEXT: s_cbranch_vccnz .LBB107_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_lshl_b32 s5, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 24
-; SI-NEXT: s_lshl_b32 s20, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 23
-; SI-NEXT: s_lshl_b32 s17, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 22
-; SI-NEXT: s_lshl_b32 s61, s16, 16
-; SI-NEXT: s_add_i32 s16, s6, 3
-; SI-NEXT: v_readlane_b32 s6, v41, 21
-; SI-NEXT: s_and_b32 s16, s16, 0xffff
-; SI-NEXT: s_lshl_b32 s7, s6, 16
-; SI-NEXT: v_readlane_b32 s6, v41, 20
-; SI-NEXT: s_or_b32 s7, s7, s16
-; SI-NEXT: s_add_i32 s6, s6, 3
-; SI-NEXT: v_readlane_b32 s16, v41, 19
-; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: s_and_b32 s6, s6, 0xffff
-; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_and_b32 s19, s19, 0xffff
-; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: v_readlane_b32 s16, v41, 18
-; SI-NEXT: s_lshl_b32 s60, s98, 16
-; SI-NEXT: s_or_b32 s17, s17, s19
-; SI-NEXT: s_add_i32 s98, s16, 3
-; SI-NEXT: v_readlane_b32 s19, v41, 17
-; SI-NEXT: s_add_i32 s21, s21, 3
-; SI-NEXT: s_and_b32 s16, s98, 0xffff
-; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_add_i32 s11, s11, 3
-; SI-NEXT: s_add_i32 s9, s9, 3
-; SI-NEXT: s_and_b32 s21, s21, 0xffff
-; SI-NEXT: s_or_b32 s16, s19, s16
-; SI-NEXT: v_readlane_b32 s19, v41, 16
; SI-NEXT: s_add_i32 s13, s13, 3
+; SI-NEXT: s_and_b32 s13, s13, 0xffff
+; SI-NEXT: s_lshl_b32 s12, s12, 16
+; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 30
+; SI-NEXT: s_add_i32 s15, s15, 3
+; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_lshl_b32 s10, s10, 16
+; SI-NEXT: s_lshl_b32 s13, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 29
+; SI-NEXT: s_and_b32 s15, s15, 0xffff
+; SI-NEXT: s_lshl_b32 s14, s14, 16
+; SI-NEXT: s_or_b32 s10, s10, s11
+; SI-NEXT: s_lshl_b32 s11, s25, 16
+; SI-NEXT: s_add_i32 s25, s6, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 28
+; SI-NEXT: s_or_b32 s14, s14, s15
+; SI-NEXT: s_lshl_b32 s15, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 27
+; SI-NEXT: s_add_i32 s23, s6, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 26
+; SI-NEXT: s_add_i32 s9, s9, 3
+; SI-NEXT: s_lshl_b32 s20, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 25
+; SI-NEXT: s_lshl_b32 s5, s21, 16
; SI-NEXT: s_and_b32 s9, s9, 0xffff
; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: s_add_i32 s29, s29, 3
-; SI-NEXT: s_or_b32 s20, s20, s21
-; SI-NEXT: s_add_i32 s96, s19, 3
-; SI-NEXT: v_readlane_b32 s21, v41, 15
-; SI-NEXT: s_add_i32 s15, s15, 3
-; SI-NEXT: s_and_b32 s13, s13, 0xffff
-; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: s_or_b32 s10, s10, s11
+; SI-NEXT: s_add_i32 s7, s7, 3
+; SI-NEXT: s_add_i32 s21, s6, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 23
; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: s_lshl_b32 s9, s89, 16
+; SI-NEXT: s_add_i32 s29, s29, 3
+; SI-NEXT: s_lshl_b32 s19, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 22
+; SI-NEXT: s_or_b32 s7, s9, s7
; SI-NEXT: s_and_b32 s9, s29, 0xffff
-; SI-NEXT: s_lshl_b32 s11, s88, 16
; SI-NEXT: s_add_i32 s27, s27, 3
-; SI-NEXT: s_and_b32 s19, s96, 0xffff
-; SI-NEXT: s_lshl_b32 s21, s21, 16
-; SI-NEXT: s_and_b32 s15, s15, 0xffff
-; SI-NEXT: s_lshl_b32 s14, s14, 16
-; SI-NEXT: s_or_b32 s12, s12, s13
+; SI-NEXT: s_add_i32 s16, s6, 3
+; SI-NEXT: v_readlane_b32 s6, v41, 21
; SI-NEXT: s_or_b32 s9, s11, s9
; SI-NEXT: s_and_b32 s11, s27, 0xffff
-; SI-NEXT: s_lshl_b32 s13, s79, 16
-; SI-NEXT: s_add_i32 s25, s25, 3
-; SI-NEXT: s_or_b32 s19, s21, s19
-; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: v_readlane_b32 s21, v41, 14
-; SI-NEXT: s_or_b32 s14, s14, s15
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: s_lshl_b32 s17, s6, 16
+; SI-NEXT: v_readlane_b32 s6, v41, 20
; SI-NEXT: s_or_b32 s11, s13, s11
; SI-NEXT: s_and_b32 s13, s25, 0xffff
-; SI-NEXT: s_lshl_b32 s15, s78, 16
-; SI-NEXT: s_add_i32 s23, s23, 3
-; SI-NEXT: s_and_b32 s18, s18, 0xffff
-; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_or_b32 s16, s17, s16
+; SI-NEXT: s_add_i32 s6, s6, 3
+; SI-NEXT: v_readlane_b32 s17, v41, 19
; SI-NEXT: s_or_b32 s13, s15, s13
; SI-NEXT: s_and_b32 s15, s23, 0xffff
-; SI-NEXT: s_lshl_b32 s22, s77, 16
+; SI-NEXT: s_and_b32 s6, s6, 0xffff
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_lshl_b32 s60, s18, 16
+; SI-NEXT: s_or_b32 s15, s20, s15
+; SI-NEXT: s_and_b32 s20, s21, 0xffff
+; SI-NEXT: s_or_b32 s6, s17, s6
+; SI-NEXT: v_readlane_b32 s17, v41, 18
+; SI-NEXT: v_readlane_b32 s18, v41, 17
+; SI-NEXT: s_or_b32 s19, s19, s20
+; SI-NEXT: s_add_i32 s98, s17, 3
+; SI-NEXT: s_lshl_b32 s20, s18, 16
+; SI-NEXT: v_readlane_b32 s18, v41, 16
+; SI-NEXT: s_and_b32 s17, s98, 0xffff
+; SI-NEXT: s_add_i32 s96, s18, 3
+; SI-NEXT: v_readlane_b32 s18, v41, 15
+; SI-NEXT: s_or_b32 s17, s20, s17
+; SI-NEXT: s_and_b32 s20, s96, 0xffff
+; SI-NEXT: s_lshl_b32 s21, s18, 16
+; SI-NEXT: v_readlane_b32 s18, v41, 24
+; SI-NEXT: s_or_b32 s20, s21, s20
+; SI-NEXT: s_add_i32 s18, s18, 3
+; SI-NEXT: v_readlane_b32 s21, v41, 14
+; SI-NEXT: s_and_b32 s18, s18, 0xffff
+; SI-NEXT: s_lshl_b32 s21, s21, 16
; SI-NEXT: s_or_b32 s18, s21, s18
; SI-NEXT: v_readlane_b32 s21, v41, 13
-; SI-NEXT: s_or_b32 s15, s22, s15
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: v_readlane_b32 s22, v41, 12
; SI-NEXT: s_and_b32 s21, s21, 0xffff
@@ -232049,42 +233444,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_and_b32 s27, s27, 0xffff
; SI-NEXT: s_lshl_b32 s28, s28, 16
; SI-NEXT: s_or_b32 s27, s28, s27
-; SI-NEXT: s_add_i32 s27, s27, 0x30000
-; SI-NEXT: s_add_i32 s26, s26, 0x30000
-; SI-NEXT: s_and_b32 s86, s27, 0xffff0000
-; SI-NEXT: s_lshl_b32 s27, s27, 16
-; SI-NEXT: s_add_i32 s25, s25, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s27, 25
-; SI-NEXT: s_and_b32 s96, s26, 0xffff0000
-; SI-NEXT: s_lshl_b32 s26, s26, 16
-; SI-NEXT: s_add_i32 s24, s24, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s26, 26
-; SI-NEXT: s_and_b32 s99, s25, 0xffff0000
-; SI-NEXT: s_lshl_b32 s25, s25, 16
-; SI-NEXT: s_add_i32 s23, s23, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s25, 27
-; SI-NEXT: s_and_b32 s97, s24, 0xffff0000
-; SI-NEXT: s_lshl_b32 s24, s24, 16
; SI-NEXT: s_add_i32 s80, s80, 3
-; SI-NEXT: s_add_i32 s22, s22, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s24, 28
-; SI-NEXT: s_and_b32 s92, s23, 0xffff0000
-; SI-NEXT: s_lshl_b32 s23, s23, 16
+; SI-NEXT: s_add_i32 s27, s27, 0x30000
; SI-NEXT: s_and_b32 s4, s80, 0xffff
; SI-NEXT: s_add_i32 s84, s84, 3
-; SI-NEXT: s_add_i32 s21, s21, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s23, 29
-; SI-NEXT: s_and_b32 s94, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s22, s22, 16
+; SI-NEXT: s_and_b32 s28, s27, 0xffff0000
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s5, s84, 0xffff
; SI-NEXT: s_add_i32 s83, s83, 3
-; SI-NEXT: s_add_i32 s18, s18, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s22, 30
-; SI-NEXT: s_and_b32 s95, s21, 0xffff0000
-; SI-NEXT: s_lshl_b32 s21, s21, 16
+; SI-NEXT: s_add_i32 s26, s26, 0x30000
+; SI-NEXT: v_writelane_b32 v41, s28, 31
+; SI-NEXT: s_lshl_b32 s27, s27, 16
; SI-NEXT: s_or_b32 s5, s60, s5
; SI-NEXT: s_and_b32 s60, s83, 0xffff
+; SI-NEXT: s_lshl_b32 s61, s79, 16
; SI-NEXT: s_add_i32 s87, s87, 3
; SI-NEXT: s_add_i32 s59, s59, 3
; SI-NEXT: s_add_i32 s57, s57, 3
@@ -232092,13 +233465,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_add_i32 s45, s45, 3
; SI-NEXT: s_add_i32 s43, s43, 3
; SI-NEXT: s_add_i32 s41, s41, 3
-; SI-NEXT: s_add_i32 s19, s19, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s21, 31
-; SI-NEXT: s_and_b32 s93, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_or_b32 s76, s61, s60
+; SI-NEXT: v_writelane_b32 v41, s27, 32
+; SI-NEXT: s_and_b32 s27, s26, 0xffff0000
+; SI-NEXT: s_or_b32 vcc_lo, s61, s60
; SI-NEXT: s_and_b32 s60, s87, 0xffff
-; SI-NEXT: s_lshl_b32 s61, s73, 16
+; SI-NEXT: s_lshl_b32 s61, s77, 16
; SI-NEXT: s_and_b32 s59, s59, 0xffff
; SI-NEXT: s_lshl_b32 s58, s58, 16
; SI-NEXT: s_and_b32 s57, s57, 0xffff
@@ -232111,24 +233482,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_lshl_b32 s42, s42, 16
; SI-NEXT: s_and_b32 s41, s41, 0xffff
; SI-NEXT: s_lshl_b32 s40, s40, 16
-; SI-NEXT: s_add_i32 s16, s16, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s18, 32
-; SI-NEXT: s_lshl_b32 s18, s19, 16
-; SI-NEXT: s_or_b32 s75, s61, s60
+; SI-NEXT: s_add_i32 s25, s25, 0x30000
+; SI-NEXT: v_writelane_b32 v41, s27, 33
+; SI-NEXT: s_lshl_b32 s26, s26, 16
+; SI-NEXT: s_or_b32 vcc_hi, s61, s60
; SI-NEXT: s_or_b32 s58, s58, s59
; SI-NEXT: s_or_b32 s56, s56, s57
; SI-NEXT: s_or_b32 s46, s46, s47
; SI-NEXT: s_or_b32 s44, s44, s45
; SI-NEXT: s_or_b32 s42, s42, s43
; SI-NEXT: s_or_b32 s40, s40, s41
-; SI-NEXT: s_add_i32 s6, s6, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s18, 33
-; SI-NEXT: s_and_b32 s31, s16, 0xffff0000
-; SI-NEXT: s_lshl_b32 s16, s16, 16
+; SI-NEXT: v_writelane_b32 v41, s26, 34
+; SI-NEXT: s_and_b32 s26, s25, 0xffff0000
; SI-NEXT: s_add_i32 s4, s4, 0x30000
; SI-NEXT: s_add_i32 s5, s5, 0x30000
-; SI-NEXT: s_add_i32 s76, s76, 0x30000
-; SI-NEXT: s_add_i32 s75, s75, 0x30000
+; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000
+; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000
; SI-NEXT: s_add_i32 s58, s58, 0x30000
; SI-NEXT: s_add_i32 s56, s56, 0x30000
; SI-NEXT: s_add_i32 s46, s46, 0x30000
@@ -232139,294 +233508,343 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a
; SI-NEXT: s_add_i32 s12, s12, 0x30000
; SI-NEXT: s_add_i32 s10, s10, 0x30000
; SI-NEXT: s_add_i32 s8, s8, 0x30000
+; SI-NEXT: s_add_i32 s7, s7, 0x30000
; SI-NEXT: s_add_i32 s9, s9, 0x30000
; SI-NEXT: s_add_i32 s11, s11, 0x30000
; SI-NEXT: s_add_i32 s13, s13, 0x30000
; SI-NEXT: s_add_i32 s15, s15, 0x30000
-; SI-NEXT: s_add_i32 s20, s20, 0x30000
+; SI-NEXT: s_add_i32 s19, s19, 0x30000
+; SI-NEXT: s_add_i32 s16, s16, 0x30000
+; SI-NEXT: s_add_i32 s6, s6, 0x30000
; SI-NEXT: s_add_i32 s17, s17, 0x30000
-; SI-NEXT: s_add_i32 s7, s7, 0x30000
-; SI-NEXT: v_writelane_b32 v41, s16, 34
-; SI-NEXT: s_and_b32 s34, s6, 0xffff0000
-; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_and_b32 s30, s19, 0xffff0000
-; SI-NEXT: v_writelane_b32 v41, s6, 35
-; SI-NEXT: s_and_b32 s35, s7, 0xffff0000
-; SI-NEXT: s_lshl_b32 s6, s7, 16
-; SI-NEXT: s_and_b32 s36, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_and_b32 s37, s20, 0xffff0000
-; SI-NEXT: s_lshl_b32 s22, s20, 16
-; SI-NEXT: s_and_b32 s38, s15, 0xffff0000
-; SI-NEXT: s_lshl_b32 s24, s15, 16
-; SI-NEXT: s_and_b32 s39, s13, 0xffff0000
-; SI-NEXT: s_lshl_b32 s28, s13, 16
-; SI-NEXT: s_and_b32 s48, s11, 0xffff0000
-; SI-NEXT: s_lshl_b32 s61, s11, 16
-; SI-NEXT: s_and_b32 s49, s9, 0xffff0000
-; SI-NEXT: s_lshl_b32 s89, s9, 16
-; SI-NEXT: s_and_b32 s50, s8, 0xffff0000
-; SI-NEXT: s_lshl_b32 s60, s8, 16
-; SI-NEXT: s_and_b32 s91, s10, 0xffff0000
-; SI-NEXT: s_lshl_b32 s90, s10, 16
-; SI-NEXT: s_and_b32 s51, s12, 0xffff0000
-; SI-NEXT: s_lshl_b32 s70, s12, 16
-; SI-NEXT: s_and_b32 s52, s14, 0xffff0000
-; SI-NEXT: s_lshl_b32 s71, s14, 16
-; SI-NEXT: s_and_b32 s53, s40, 0xffff0000
-; SI-NEXT: s_lshl_b32 s20, s40, 16
-; SI-NEXT: s_and_b32 s54, s42, 0xffff0000
-; SI-NEXT: s_lshl_b32 s81, s42, 16
-; SI-NEXT: s_and_b32 s55, s44, 0xffff0000
-; SI-NEXT: s_lshl_b32 s63, s44, 16
-; SI-NEXT: s_and_b32 s64, s46, 0xffff0000
-; SI-NEXT: s_lshl_b32 s72, s46, 16
-; SI-NEXT: s_and_b32 s65, s56, 0xffff0000
-; SI-NEXT: s_lshl_b32 s82, s56, 16
-; SI-NEXT: s_and_b32 s66, s58, 0xffff0000
-; SI-NEXT: s_lshl_b32 s74, s58, 16
-; SI-NEXT: s_and_b32 s67, s75, 0xffff0000
-; SI-NEXT: s_lshl_b32 s75, s75, 16
-; SI-NEXT: s_and_b32 s68, s76, 0xffff0000
-; SI-NEXT: s_lshl_b32 s76, s76, 16
-; SI-NEXT: s_and_b32 s69, s5, 0xffff0000
-; SI-NEXT: s_lshl_b32 s85, s5, 16
-; SI-NEXT: s_and_b32 s26, s4, 0xffff0000
-; SI-NEXT: s_lshl_b32 s5, s4, 16
-; SI-NEXT: v_writelane_b32 v41, s6, 36
+; SI-NEXT: s_add_i32 s20, s20, 0x30000
+; SI-NEXT: s_add_i32 s18, s18, 0x30000
+; SI-NEXT: s_add_i32 s21, s21, 0x30000
+; SI-NEXT: s_add_i32 s22, s22, 0x30000
+; SI-NEXT: s_add_i32 s23, s23, 0x30000
+; SI-NEXT: s_add_i32 s24, s24, 0x30000
+; SI-NEXT: v_writelane_b32 v41, s26, 35
+; SI-NEXT: s_lshl_b32 s25, s25, 16
+; SI-NEXT: v_writelane_b32 v41, s25, 36
+; SI-NEXT: s_and_b32 s25, s24, 0xffff0000
+; SI-NEXT: s_lshl_b32 s53, s24, 16
+; SI-NEXT: s_and_b32 s73, s23, 0xffff0000
+; SI-NEXT: s_lshl_b32 s51, s23, 16
+; SI-NEXT: s_and_b32 s63, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s98, s22, 16
+; SI-NEXT: s_and_b32 s62, s21, 0xffff0000
+; SI-NEXT: s_lshl_b32 s97, s21, 16
+; SI-NEXT: s_and_b32 s96, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s85, s18, 16
+; SI-NEXT: s_and_b32 s99, s20, 0xffff0000
+; SI-NEXT: s_lshl_b32 s81, s20, 16
+; SI-NEXT: s_and_b32 s86, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s70, s17, 16
+; SI-NEXT: s_and_b32 s82, s6, 0xffff0000
+; SI-NEXT: s_lshl_b32 s68, s6, 16
+; SI-NEXT: s_and_b32 s71, s16, 0xffff0000
+; SI-NEXT: s_lshl_b32 s66, s16, 16
+; SI-NEXT: s_and_b32 s69, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s64, s19, 16
+; SI-NEXT: s_and_b32 s67, s15, 0xffff0000
+; SI-NEXT: s_lshl_b32 s54, s15, 16
+; SI-NEXT: s_and_b32 s65, s13, 0xffff0000
+; SI-NEXT: s_lshl_b32 s52, s13, 16
+; SI-NEXT: s_and_b32 s55, s11, 0xffff0000
+; SI-NEXT: s_lshl_b32 s50, s11, 16
+; SI-NEXT: s_and_b32 s48, s9, 0xffff0000
+; SI-NEXT: s_lshl_b32 s49, s9, 16
+; SI-NEXT: s_and_b32 s39, s7, 0xffff0000
+; SI-NEXT: s_lshl_b32 s37, s7, 16
+; SI-NEXT: s_and_b32 s38, s8, 0xffff0000
+; SI-NEXT: s_lshl_b32 s36, s8, 16
+; SI-NEXT: s_and_b32 s31, s10, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s10, 16
+; SI-NEXT: s_and_b32 s95, s12, 0xffff0000
+; SI-NEXT: s_lshl_b32 s34, s12, 16
+; SI-NEXT: s_and_b32 s93, s14, 0xffff0000
+; SI-NEXT: s_lshl_b32 s30, s14, 16
+; SI-NEXT: s_and_b32 s91, s40, 0xffff0000
+; SI-NEXT: s_lshl_b32 s94, s40, 16
+; SI-NEXT: s_and_b32 s88, s42, 0xffff0000
+; SI-NEXT: s_lshl_b32 s92, s42, 16
+; SI-NEXT: s_and_b32 s78, s44, 0xffff0000
+; SI-NEXT: s_lshl_b32 s90, s44, 16
+; SI-NEXT: s_and_b32 s76, s46, 0xffff0000
+; SI-NEXT: s_lshl_b32 s28, s46, 16
+; SI-NEXT: s_and_b32 s74, s56, 0xffff0000
+; SI-NEXT: s_lshl_b32 s26, s56, 16
+; SI-NEXT: s_and_b32 s72, s58, 0xffff0000
+; SI-NEXT: s_lshl_b32 s24, s58, 16
+; SI-NEXT: s_and_b32 s61, vcc_hi, 0xffff0000
+; SI-NEXT: s_lshl_b32 s22, vcc_hi, 16
+; SI-NEXT: s_and_b32 s17, vcc_lo, 0xffff0000
+; SI-NEXT: s_lshl_b32 s20, vcc_lo, 16
+; SI-NEXT: s_and_b32 s60, s5, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s5, 16
+; SI-NEXT: s_and_b32 s5, s4, 0xffff0000
+; SI-NEXT: s_lshl_b32 s4, s4, 16
+; SI-NEXT: v_writelane_b32 v41, s25, 37
; SI-NEXT: .LBB107_5: ; %end
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86
-; SI-NEXT: v_readlane_b32 s4, v41, 25
+; SI-NEXT: v_readlane_b32 s6, v41, 32
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
+; SI-NEXT: v_readlane_b32 s6, v41, 31
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_readlane_b32 s6, v41, 34
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96
-; SI-NEXT: v_readlane_b32 s4, v41, 26
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
+; SI-NEXT: v_readlane_b32 s6, v41, 33
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT: v_readlane_b32 s6, v41, 36
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99
-; SI-NEXT: v_readlane_b32 s4, v41, 27
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
+; SI-NEXT: v_readlane_b32 s6, v41, 35
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; SI-NEXT: v_readlane_b32 s6, v41, 37
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97
-; SI-NEXT: v_readlane_b32 s4, v41, 28
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
-; SI-NEXT: v_readlane_b32 s4, v41, 29
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s73
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94
-; SI-NEXT: v_readlane_b32 s4, v41, 30
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
-; SI-NEXT: v_readlane_b32 s4, v41, 31
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93
-; SI-NEXT: v_readlane_b32 s4, v41, 32
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s96
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
-; SI-NEXT: v_readlane_b32 s4, v41, 33
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31
-; SI-NEXT: v_readlane_b32 s4, v41, 34
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
-; SI-NEXT: v_readlane_b32 s4, v41, 35
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
-; SI-NEXT: v_readlane_b32 s4, v41, 36
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s69
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s65
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s55
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s39
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s31
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s95
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s93
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s91
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: v_readlane_b32 s99, v40, 35
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 3e96ab1d597d6..5800414be7476 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -2439,14 +2439,14 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2458,14 +2458,18 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB22_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr11
@@ -2477,30 +2481,34 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB22_2
; SI-NEXT: .LBB22_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2513,12 +2521,12 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB22_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -2529,15 +2537,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -2547,15 +2555,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -2565,15 +2573,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -2583,9 +2591,9 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB22_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2848,50 +2856,58 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v12
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -2906,7 +2922,7 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -2914,7 +2930,7 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -2922,17 +2938,17 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
@@ -2940,17 +2956,17 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -2958,15 +2974,15 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -2976,9 +2992,9 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -6897,14 +6913,14 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -6916,14 +6932,18 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB46_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr11
@@ -6935,30 +6955,34 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB46_2
; SI-NEXT: .LBB46_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -6971,12 +6995,12 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB46_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -6987,15 +7011,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -7005,15 +7029,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -7023,15 +7047,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -7041,9 +7065,9 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB46_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -7306,50 +7330,58 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v12
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -7364,7 +7396,7 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -7372,7 +7404,7 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -7380,17 +7412,17 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
@@ -7398,17 +7430,17 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -7416,15 +7448,15 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -7434,9 +7466,9 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -11003,14 +11035,14 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -11022,49 +11054,57 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB66_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB66_2
; SI-NEXT: .LBB66_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -11077,12 +11117,12 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB66_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -11093,15 +11133,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -11111,15 +11151,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -11129,15 +11169,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -11147,9 +11187,9 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB66_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -11412,50 +11452,58 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v12
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: .LBB67_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
@@ -11470,7 +11518,7 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -11478,7 +11526,7 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11486,17 +11534,17 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
@@ -11504,17 +11552,17 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -11522,15 +11570,15 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -11540,9 +11588,9 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -14695,14 +14743,14 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -14714,14 +14762,18 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB82_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: ; implicit-def: $vgpr13
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr11
@@ -14733,30 +14785,34 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB82_2
; SI-NEXT: .LBB82_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -14769,12 +14825,12 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB82_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -14785,15 +14841,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -14803,15 +14859,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -14821,15 +14877,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -14839,9 +14895,9 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB82_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -15104,50 +15160,58 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v12
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: .LBB83_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
@@ -15162,7 +15226,7 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -15170,7 +15234,7 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -15178,17 +15242,17 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
@@ -15196,17 +15260,17 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
@@ -15214,15 +15278,15 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -15232,9 +15296,9 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB83_3:
; VI-NEXT: s_branch .LBB83_2
@@ -18024,32 +18088,36 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_2
; SI-NEXT: .LBB94_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v8
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -18129,14 +18197,14 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB94_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -18426,32 +18494,36 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v8
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: .LBB95_3: ; %end
@@ -18489,12 +18561,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
@@ -18506,12 +18577,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -18523,12 +18593,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; VI-NEXT: v_add_f32_e32 v3, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; VI-NEXT: v_bfe_u32 v8, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
@@ -18543,11 +18612,14 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB95_3:
; VI-NEXT: s_branch .LBB95_2
@@ -21043,14 +21115,14 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB102_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -21411,12 +21483,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
@@ -21428,12 +21499,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -21445,12 +21515,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; VI-NEXT: v_add_f32_e32 v3, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; VI-NEXT: v_bfe_u32 v8, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
@@ -21465,11 +21534,14 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -23438,14 +23510,14 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v6
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -23472,59 +23544,71 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB108_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v20, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v18, 16
-; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16
-; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v8, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v12, v5, v6
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24
; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16
; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
; SI-NEXT: ; implicit-def: $vgpr23
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: .LBB108_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v8, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v12, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
-; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -23578,91 +23662,95 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB108_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v19, v1, v0, 16
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18
-; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
-; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v18, v1, v0, 16
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; VI-NEXT: v_or_b32_e32 v1, v19, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17
-; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v17, v1, v0, 16
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16
-; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
-; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v17
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
+; VI-NEXT: v_or_b32_e32 v6, v17, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v16
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_e32 v0, v18, v0
+; VI-NEXT: v_or_b32_e32 v5, v16, v3
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; VI-NEXT: .LBB108_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, v18
@@ -24090,60 +24178,72 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s24, 0
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
-; SI-NEXT: v_alignbit_b32 v0, v0, v19, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v17, 16
-; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16
-; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v8, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v12, v5, v6
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24
; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16
; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20
; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v8, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v12, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -24181,142 +24281,144 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; VI-NEXT: s_cmp_lg_u32 s20, 0
; VI-NEXT: s_cbranch_scc0 .LBB109_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s10, s19, 24
-; VI-NEXT: s_lshr_b32 s11, s19, 16
-; VI-NEXT: s_lshr_b32 s13, s19, 8
-; VI-NEXT: s_lshr_b32 s12, s18, 16
-; VI-NEXT: s_lshr_b32 s14, s18, 8
+; VI-NEXT: s_lshr_b32 s22, s19, 24
+; VI-NEXT: s_lshr_b32 s21, s19, 16
+; VI-NEXT: s_lshr_b32 s11, s19, 8
+; VI-NEXT: s_lshr_b32 s23, s18, 16
+; VI-NEXT: s_lshr_b32 s13, s18, 8
; VI-NEXT: s_lshr_b32 s15, s17, 24
-; VI-NEXT: s_lshr_b32 s20, s17, 16
-; VI-NEXT: s_lshr_b32 s22, s17, 8
-; VI-NEXT: s_lshr_b32 s21, s16, 16
-; VI-NEXT: s_lshr_b32 s23, s16, 8
+; VI-NEXT: s_lshr_b32 s14, s17, 16
+; VI-NEXT: s_lshr_b32 s10, s17, 8
+; VI-NEXT: s_lshr_b32 s20, s16, 16
+; VI-NEXT: s_lshr_b32 s12, s16, 8
; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
; VI-NEXT: s_cbranch_execnz .LBB109_4
; VI-NEXT: .LBB109_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v18, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_or_b32_e32 v2, v17, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v19, v1, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; VI-NEXT: v_add_f32_e32 v4, s4, v3
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v3
+; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_or_b32_e32 v6, v16, v4
+; VI-NEXT: v_add_f32_e32 v4, s4, v3
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_add_f32_e32 v3, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v10, v5, v8, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19]
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_e32 v1, v0, v1
+; VI-NEXT: v_or_b32_e32 v5, v8, v4
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
; VI-NEXT: s_branch .LBB109_5
; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr23
-; VI-NEXT: ; implicit-def: $sgpr21
-; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr22
+; VI-NEXT: ; implicit-def: $sgpr12
; VI-NEXT: ; implicit-def: $sgpr20
-; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr14
-; VI-NEXT: ; implicit-def: $sgpr12
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr15
; VI-NEXT: ; implicit-def: $sgpr13
+; VI-NEXT: ; implicit-def: $sgpr23
+; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr21
+; VI-NEXT: ; implicit-def: $sgpr22
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v18, s16
-; VI-NEXT: v_mov_b32_e32 v19, s17
-; VI-NEXT: v_mov_b32_e32 v16, s18
-; VI-NEXT: v_mov_b32_e32 v17, s19
-; VI-NEXT: v_mov_b32_e32 v1, s23
-; VI-NEXT: v_mov_b32_e32 v2, s21
-; VI-NEXT: v_mov_b32_e32 v5, s22
-; VI-NEXT: v_mov_b32_e32 v6, s20
+; VI-NEXT: v_mov_b32_e32 v8, s18
+; VI-NEXT: v_mov_b32_e32 v10, s23
+; VI-NEXT: v_mov_b32_e32 v16, s19
+; VI-NEXT: v_mov_b32_e32 v15, s22
+; VI-NEXT: v_mov_b32_e32 v14, s21
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s20
+; VI-NEXT: v_mov_b32_e32 v17, s17
; VI-NEXT: v_mov_b32_e32 v7, s15
-; VI-NEXT: v_mov_b32_e32 v9, s14
-; VI-NEXT: v_mov_b32_e32 v10, s12
-; VI-NEXT: v_mov_b32_e32 v13, s13
-; VI-NEXT: v_mov_b32_e32 v14, s11
-; VI-NEXT: v_mov_b32_e32 v15, s10
+; VI-NEXT: v_mov_b32_e32 v6, s14
+; VI-NEXT: v_mov_b32_e32 v9, s13
+; VI-NEXT: v_mov_b32_e32 v13, s11
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v5, s10
; VI-NEXT: v_mov_b32_e32 v11, s6
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v18
-; VI-NEXT: v_mov_b32_e32 v4, v19
-; VI-NEXT: v_mov_b32_e32 v8, v16
-; VI-NEXT: v_mov_b32_e32 v12, v17
+; VI-NEXT: v_mov_b32_e32 v4, v17
+; VI-NEXT: v_mov_b32_e32 v12, v16
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index f8ffaa456c2b3..599b1d6336ec3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -3327,22 +3327,22 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -3354,89 +3354,105 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB22_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
-; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB22_2
; SI-NEXT: .LBB22_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3449,12 +3465,12 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB22_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -3465,15 +3481,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -3483,15 +3499,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -3501,15 +3517,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -3519,15 +3535,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -3537,15 +3553,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -3555,15 +3571,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -3573,15 +3589,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -3591,9 +3607,9 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB22_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4054,90 +4070,106 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v24
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -4152,7 +4184,7 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -4160,7 +4192,7 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -4168,17 +4200,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -4186,17 +4218,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -4204,17 +4236,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -4222,17 +4254,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -4240,17 +4272,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
@@ -4258,17 +4290,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
@@ -4276,15 +4308,15 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -4294,9 +4326,9 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -10340,22 +10372,22 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -10367,89 +10399,105 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB46_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
-; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB46_2
; SI-NEXT: .LBB46_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -10462,12 +10510,12 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB46_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -10478,15 +10526,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -10496,15 +10544,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -10514,15 +10562,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -10532,15 +10580,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -10550,15 +10598,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -10568,15 +10616,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -10586,15 +10634,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -10604,9 +10652,9 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB46_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -11067,90 +11115,106 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v24
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -11165,7 +11229,7 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -11173,7 +11237,7 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11181,17 +11245,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11199,17 +11263,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11217,17 +11281,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11235,17 +11299,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -11253,17 +11317,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
@@ -11271,17 +11335,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
@@ -11289,15 +11353,15 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -11307,9 +11371,9 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -16917,22 +16981,22 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -16944,89 +17008,105 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB66_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
-; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB66_2
; SI-NEXT: .LBB66_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -17039,12 +17119,12 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB66_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -17055,15 +17135,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -17073,15 +17153,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -17091,15 +17171,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -17109,15 +17189,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -17127,15 +17207,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -17145,15 +17225,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -17163,15 +17243,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -17181,9 +17261,9 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB66_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -17644,90 +17724,106 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v24
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: .LBB67_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
@@ -17742,7 +17838,7 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -17750,7 +17846,7 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -17758,17 +17854,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -17776,17 +17872,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -17794,17 +17890,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -17812,17 +17908,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -17830,17 +17926,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
@@ -17848,17 +17944,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
@@ -17866,15 +17962,15 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -17884,9 +17980,9 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -22955,22 +23051,22 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -22982,89 +23078,105 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB82_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
-; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr24
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB82_2
; SI-NEXT: .LBB82_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -23077,12 +23189,12 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB82_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -23093,15 +23205,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -23111,15 +23223,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -23129,15 +23241,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -23147,15 +23259,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -23165,15 +23277,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -23183,15 +23295,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -23201,15 +23313,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -23219,9 +23331,9 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB82_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -23682,90 +23794,106 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v24
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: .LBB83_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
@@ -23780,7 +23908,7 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -23788,7 +23916,7 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -23796,17 +23924,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -23814,17 +23942,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -23832,17 +23960,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -23850,17 +23978,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -23868,17 +23996,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v8, v2, 16, 1
@@ -23886,17 +24014,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v8, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1
; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
@@ -23904,15 +24032,15 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v8, s4, v0
; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -23922,9 +24050,9 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB83_3:
; VI-NEXT: s_branch .LBB83_2
@@ -28535,58 +28663,66 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_2
; SI-NEXT: .LBB94_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v8, v2, v9
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v12, v2, v13
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v16
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
@@ -28728,26 +28864,26 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
; VI-NEXT: v_bfe_u32 v16, v7, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
+; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16
-; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB94_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -29257,58 +29393,66 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v8, v2, v9
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v12, v2, v13
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v16
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
@@ -29356,83 +29500,82 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v1
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v1
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v4, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_lshl_b32 s5, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s4, v1
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_add_f32_e32 v5, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s5, v1
; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_lshl_b32 s4, s20, 16
+; VI-NEXT: s_and_b32 s5, s22, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s4, v1
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: s_lshl_b32 s5, s22, 16
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_add_f32_e32 v6, s5, v1
-; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc
; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: s_and_b32 s5, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
-; VI-NEXT: v_add_f32_e32 v7, s5, v1
-; VI-NEXT: v_bfe_u32 v12, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
; VI-NEXT: s_lshl_b32 s5, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc
; VI-NEXT: v_add_f32_e32 v7, s5, v1
; VI-NEXT: v_bfe_u32 v13, v7, 16, 1
; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7
@@ -29449,44 +29592,45 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16
-; VI-NEXT: v_add_f32_e32 v12, s4, v1
-; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_bfe_u32 v13, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s4, v1
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; VI-NEXT: v_bfe_u32 v14, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc
; VI-NEXT: v_bfe_u32 v14, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16
-; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; VI-NEXT: v_or_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB95_3:
; VI-NEXT: s_branch .LBB95_2
@@ -33634,26 +33778,26 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
; VI-NEXT: v_bfe_u32 v16, v7, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
+; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16
-; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB102_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -34277,83 +34421,82 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v1
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v3, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v1
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_add_f32_e32 v4, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: s_lshl_b32 s5, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_add_f32_e32 v5, s5, v1
; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_add_f32_e32 v5, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: s_lshl_b32 s4, s20, 16
+; VI-NEXT: s_and_b32 s5, s22, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s4, v1
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: s_lshl_b32 s5, s22, 16
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_add_f32_e32 v6, s5, v1
-; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc
; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: s_and_b32 s5, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
-; VI-NEXT: v_add_f32_e32 v7, s5, v1
-; VI-NEXT: v_bfe_u32 v12, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
; VI-NEXT: s_lshl_b32 s5, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc
; VI-NEXT: v_add_f32_e32 v7, s5, v1
; VI-NEXT: v_bfe_u32 v13, v7, 16, 1
; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7
@@ -34370,44 +34513,45 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16
-; VI-NEXT: v_add_f32_e32 v12, s4, v1
-; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_bfe_u32 v13, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s4, v1
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; VI-NEXT: v_bfe_u32 v14, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_add_f32_e32 v1, s4, v1
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc
; VI-NEXT: v_bfe_u32 v14, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1
; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16
-; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
+; VI-NEXT: v_or_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
+; VI-NEXT: v_or_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -37698,22 +37842,22 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -37756,22 +37900,30 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB108_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
-; SI-NEXT: v_alignbit_b32 v8, v5, v48, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33
-; SI-NEXT: v_alignbit_b32 v16, v5, v52, 16
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49
-; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v34, 16
-; SI-NEXT: v_alignbit_b32 v12, v14, v38, 16
-; SI-NEXT: v_alignbit_b32 v20, v22, v50, 16
-; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16
-; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v48
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v8, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v12, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v16, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v20, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v55
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
+; SI-NEXT: v_or_b32_e32 v24, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v49
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v28, v5, v6
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -37784,81 +37936,97 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24
; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16
; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33
; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37
; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49
; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr32
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: .LBB108_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v24, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v28, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v16, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v20, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v8, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v12, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
-; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -37885,10 +38053,10 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; VI-LABEL: bitcast_v16bf16_to_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v33, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
-; VI-NEXT: v_mov_b32_e32 v35, v3
-; VI-NEXT: v_mov_b32_e32 v34, v2
+; VI-NEXT: v_mov_b32_e32 v35, v5
+; VI-NEXT: v_mov_b32_e32 v34, v4
+; VI-NEXT: v_mov_b32_e32 v33, v3
+; VI-NEXT: v_mov_b32_e32 v32, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; VI-NEXT: ; implicit-def: $vgpr38
; VI-NEXT: ; implicit-def: $vgpr2
@@ -37923,39 +38091,39 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34
+; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v33
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v33
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v32
; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1
; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0
; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35]
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35]
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33]
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
; VI-NEXT: .LBB108_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB108_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
@@ -37963,165 +38131,173 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_or_b32_e32 v3, v1, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cndmask_b32_e32 v16, v4, v5, vcc
+; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v10, v5, v9, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v33
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_bfe_u32 v9, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: v_or_b32_e32 v5, v33, v4
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_bfe_u32 v9, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11
+; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v35
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11
+; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v15, v11, v12, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v35
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v11
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v34
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_cndmask_b32_e32 v18, v12, v13, vcc
+; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v15
+; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; VI-NEXT: v_or_b32_e32 v12, v35, v9
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18
+; VI-NEXT: v_or_b32_e32 v11, v34, v9
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v22, v13, v14, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v13, vcc
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v13, v14, vcc
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v9, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v22
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v35
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v35, v3, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v34
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v34, v3, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v33
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v33, v3, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v32, v3, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34
-; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v24
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_or_b32_e32 v14, v7, v9
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23
+; VI-NEXT: v_or_b32_e32 v2, v0, v2
+; VI-NEXT: v_or_b32_e32 v4, v32, v4
+; VI-NEXT: v_or_b32_e32 v13, v6, v9
+; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v4
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[2:3]
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v15
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v24
+; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
; VI-NEXT: .LBB108_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mov_b32_e32 v8, v34
-; VI-NEXT: v_mov_b32_e32 v12, v35
-; VI-NEXT: v_mov_b32_e32 v16, v32
-; VI-NEXT: v_mov_b32_e32 v20, v33
+; VI-NEXT: v_mov_b32_e32 v8, v32
+; VI-NEXT: v_mov_b32_e32 v12, v33
+; VI-NEXT: v_mov_b32_e32 v16, v34
+; VI-NEXT: v_mov_b32_e32 v20, v35
; VI-NEXT: v_mov_b32_e32 v24, v6
; VI-NEXT: v_mov_b32_e32 v28, v7
; VI-NEXT: v_mov_b32_e32 v1, v38
@@ -38871,40 +39047,48 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29
; SI-NEXT: v_mul_f32_e32 v53, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_alignbit_b32 v8, v5, v39, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36
-; SI-NEXT: v_alignbit_b32 v16, v5, v51, 16
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v33, 16
-; SI-NEXT: v_alignbit_b32 v12, v14, v37, 16
-; SI-NEXT: v_alignbit_b32 v20, v22, v49, 16
-; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16
-; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v8, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v12, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v16, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
+; SI-NEXT: v_or_b32_e32 v20, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v55
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
+; SI-NEXT: v_or_b32_e32 v24, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v28, v5, v6
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -38917,64 +39101,80 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24
; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16
; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36
; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48
; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52
; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36
+; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48
+; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v24, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v28, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v16, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v20, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v8, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v12, v0, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -39038,270 +39238,274 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; VI-NEXT: s_cmp_lg_u32 s24, 0
; VI-NEXT: s_cbranch_scc0 .LBB109_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s14, s23, 24
-; VI-NEXT: s_lshr_b32 s15, s23, 16
+; VI-NEXT: s_lshr_b32 s58, s23, 24
+; VI-NEXT: s_lshr_b32 s57, s23, 16
; VI-NEXT: s_lshr_b32 s25, s23, 8
-; VI-NEXT: s_lshr_b32 s24, s22, 16
-; VI-NEXT: s_lshr_b32 s26, s22, 8
-; VI-NEXT: s_lshr_b32 s27, s21, 24
-; VI-NEXT: s_lshr_b32 s28, s21, 16
-; VI-NEXT: s_lshr_b32 s40, s21, 8
-; VI-NEXT: s_lshr_b32 s29, s20, 16
-; VI-NEXT: s_lshr_b32 s41, s20, 8
-; VI-NEXT: s_lshr_b32 s42, s19, 24
+; VI-NEXT: s_lshr_b32 s59, s22, 16
+; VI-NEXT: s_lshr_b32 s29, s22, 8
+; VI-NEXT: s_lshr_b32 s47, s21, 24
+; VI-NEXT: s_lshr_b32 s46, s21, 16
+; VI-NEXT: s_lshr_b32 s24, s21, 8
+; VI-NEXT: s_lshr_b32 s56, s20, 16
+; VI-NEXT: s_lshr_b32 s28, s20, 8
+; VI-NEXT: s_lshr_b32 s44, s19, 24
; VI-NEXT: s_lshr_b32 s43, s19, 16
-; VI-NEXT: s_lshr_b32 s45, s19, 8
-; VI-NEXT: s_lshr_b32 s44, s18, 16
-; VI-NEXT: s_lshr_b32 s46, s18, 8
-; VI-NEXT: s_lshr_b32 s47, s17, 24
-; VI-NEXT: s_lshr_b32 s56, s17, 16
-; VI-NEXT: s_lshr_b32 s58, s17, 8
-; VI-NEXT: s_lshr_b32 s57, s16, 16
-; VI-NEXT: s_lshr_b32 s59, s16, 8
+; VI-NEXT: s_lshr_b32 s15, s19, 8
+; VI-NEXT: s_lshr_b32 s45, s18, 16
+; VI-NEXT: s_lshr_b32 s27, s18, 8
+; VI-NEXT: s_lshr_b32 s41, s17, 24
+; VI-NEXT: s_lshr_b32 s40, s17, 16
+; VI-NEXT: s_lshr_b32 s14, s17, 8
+; VI-NEXT: s_lshr_b32 s42, s16, 16
+; VI-NEXT: s_lshr_b32 s26, s16, 8
; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24
; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24
; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
; VI-NEXT: s_cbranch_execnz .LBB109_4
; VI-NEXT: .LBB109_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v2, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v0, s4, v2
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; VI-NEXT: v_add_f32_e32 v1, s4, v2
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v1
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v3, v35, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; VI-NEXT: v_add_f32_e32 v0, s4, v2
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v36, v2, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v2
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; VI-NEXT: v_add_f32_e32 v4, s4, v1
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_lshl_b32 s4, s19, 16
+; VI-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc
+; VI-NEXT: v_add_f32_e32 v5, s4, v1
+; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v2
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_or_b32_e32 v5, v34, v4
+; VI-NEXT: v_add_f32_e32 v4, s4, v1
+; VI-NEXT: v_bfe_u32 v8, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4
+; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_cndmask_b32_e32 v10, v8, v9, vcc
+; VI-NEXT: v_add_f32_e32 v8, s4, v1
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v2
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc
+; VI-NEXT: v_add_f32_e32 v9, s4, v1
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s21, 16
+; VI-NEXT: v_cndmask_b32_e32 v15, v11, v12, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v1
+; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v15
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v2
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_or_b32_e32 v12, v33, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v1
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: s_lshl_b32 s4, s20, 16
+; VI-NEXT: v_cndmask_b32_e32 v18, v11, v13, vcc
+; VI-NEXT: v_add_f32_e32 v11, s4, v1
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v11
; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s4, v2
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v2
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v11, v16, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v1
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: s_lshl_b32 s4, s23, 16
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_add_f32_e32 v9, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v22, v13, v14, vcc
+; VI-NEXT: v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_add_f32_e32 v2, s4, v2
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16
-; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25
-; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24
-; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17
-; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17
-; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0
+; VI-NEXT: v_cndmask_b32_e32 v9, v13, v14, vcc
+; VI-NEXT: v_add_f32_e32 v13, s4, v1
+; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; VI-NEXT: s_lshl_b32 s4, s22, 16
+; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v1, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v23, v14, v17, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v1
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v14, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v13, v14, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_e32 v14, v32, v1
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23
+; VI-NEXT: v_or_b32_e32 v2, v0, v2
+; VI-NEXT: v_or_b32_e32 v4, v8, v4
+; VI-NEXT: v_or_b32_e32 v13, v24, v1
+; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v4
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[4:5]
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[2:3]
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v15
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v36
; VI-NEXT: s_branch .LBB109_5
; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr59
-; VI-NEXT: ; implicit-def: $sgpr57
+; VI-NEXT: ; implicit-def: $sgpr26
+; VI-NEXT: ; implicit-def: $sgpr42
; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr58
-; VI-NEXT: ; implicit-def: $sgpr56
-; VI-NEXT: ; implicit-def: $sgpr47
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr44
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr40
+; VI-NEXT: ; implicit-def: $sgpr41
+; VI-NEXT: ; implicit-def: $sgpr27
; VI-NEXT: ; implicit-def: $sgpr45
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr15
; VI-NEXT: ; implicit-def: $sgpr43
-; VI-NEXT: ; implicit-def: $sgpr42
-; VI-NEXT: ; implicit-def: $sgpr41
-; VI-NEXT: ; implicit-def: $sgpr29
-; VI-NEXT: ; implicit-def: $sgpr8
-; VI-NEXT: ; implicit-def: $sgpr40
+; VI-NEXT: ; implicit-def: $sgpr44
; VI-NEXT: ; implicit-def: $sgpr28
-; VI-NEXT: ; implicit-def: $sgpr27
-; VI-NEXT: ; implicit-def: $sgpr26
+; VI-NEXT: ; implicit-def: $sgpr56
+; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: ; implicit-def: $sgpr24
+; VI-NEXT: ; implicit-def: $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr47
+; VI-NEXT: ; implicit-def: $sgpr29
+; VI-NEXT: ; implicit-def: $sgpr59
; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr25
-; VI-NEXT: ; implicit-def: $sgpr15
-; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr57
+; VI-NEXT: ; implicit-def: $sgpr58
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
-; VI-NEXT: v_mov_b32_e32 v8, s18
-; VI-NEXT: v_mov_b32_e32 v9, s19
-; VI-NEXT: v_mov_b32_e32 v16, s20
-; VI-NEXT: v_mov_b32_e32 v17, s21
; VI-NEXT: v_mov_b32_e32 v24, s22
-; VI-NEXT: v_mov_b32_e32 v25, s23
-; VI-NEXT: v_mov_b32_e32 v35, s59
-; VI-NEXT: v_mov_b32_e32 v2, s57
-; VI-NEXT: v_mov_b32_e32 v5, s58
-; VI-NEXT: v_mov_b32_e32 v6, s56
-; VI-NEXT: v_mov_b32_e32 v7, s47
-; VI-NEXT: v_mov_b32_e32 v34, s46
-; VI-NEXT: v_mov_b32_e32 v10, s44
-; VI-NEXT: v_mov_b32_e32 v13, s45
+; VI-NEXT: v_mov_b32_e32 v26, s59
+; VI-NEXT: v_mov_b32_e32 v32, s23
+; VI-NEXT: v_mov_b32_e32 v31, s58
+; VI-NEXT: v_mov_b32_e32 v30, s57
+; VI-NEXT: v_mov_b32_e32 v16, s20
+; VI-NEXT: v_mov_b32_e32 v18, s56
+; VI-NEXT: v_mov_b32_e32 v33, s21
+; VI-NEXT: v_mov_b32_e32 v23, s47
+; VI-NEXT: v_mov_b32_e32 v22, s46
+; VI-NEXT: v_mov_b32_e32 v8, s18
+; VI-NEXT: v_mov_b32_e32 v10, s45
+; VI-NEXT: v_mov_b32_e32 v34, s19
+; VI-NEXT: v_mov_b32_e32 v15, s44
; VI-NEXT: v_mov_b32_e32 v14, s43
-; VI-NEXT: v_mov_b32_e32 v15, s42
-; VI-NEXT: v_mov_b32_e32 v33, s41
-; VI-NEXT: v_mov_b32_e32 v18, s29
-; VI-NEXT: v_mov_b32_e32 v21, s40
-; VI-NEXT: v_mov_b32_e32 v22, s28
-; VI-NEXT: v_mov_b32_e32 v23, s27
-; VI-NEXT: v_mov_b32_e32 v32, s26
-; VI-NEXT: v_mov_b32_e32 v26, s24
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s42
+; VI-NEXT: v_mov_b32_e32 v35, s17
+; VI-NEXT: v_mov_b32_e32 v7, s41
+; VI-NEXT: v_mov_b32_e32 v6, s40
+; VI-NEXT: v_mov_b32_e32 v25, s29
; VI-NEXT: v_mov_b32_e32 v29, s25
-; VI-NEXT: v_mov_b32_e32 v30, s15
-; VI-NEXT: v_mov_b32_e32 v31, s14
+; VI-NEXT: v_mov_b32_e32 v17, s28
+; VI-NEXT: v_mov_b32_e32 v21, s24
+; VI-NEXT: v_mov_b32_e32 v9, s27
+; VI-NEXT: v_mov_b32_e32 v13, s15
+; VI-NEXT: v_mov_b32_e32 v1, s26
+; VI-NEXT: v_mov_b32_e32 v5, s14
; VI-NEXT: v_mov_b32_e32 v27, s10
; VI-NEXT: v_mov_b32_e32 v19, s8
; VI-NEXT: v_mov_b32_e32 v11, s6
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mov_b32_e32 v12, v9
-; VI-NEXT: v_mov_b32_e32 v20, v17
-; VI-NEXT: v_mov_b32_e32 v28, v25
-; VI-NEXT: v_mov_b32_e32 v1, v35
-; VI-NEXT: v_mov_b32_e32 v9, v34
-; VI-NEXT: v_mov_b32_e32 v17, v33
-; VI-NEXT: v_mov_b32_e32 v25, v32
+; VI-NEXT: v_mov_b32_e32 v4, v35
+; VI-NEXT: v_mov_b32_e32 v12, v34
+; VI-NEXT: v_mov_b32_e32 v20, v33
+; VI-NEXT: v_mov_b32_e32 v28, v32
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 48c9b8775a474..cc427eb9326a9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -1210,8 +1210,8 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -1223,19 +1223,21 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB14_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB14_2
; SI-NEXT: .LBB14_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1248,12 +1250,12 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB14_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -1263,9 +1265,9 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB14_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1386,20 +1388,22 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s18, 0
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: s_cbranch_scc0 .LBB15_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: s_cbranch_execnz .LBB15_3
; SI-NEXT: .LBB15_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: .LBB15_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB15_4:
@@ -1414,13 +1418,13 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB15_4
; VI-NEXT: .LBB15_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -1430,9 +1434,9 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB15_3:
; VI-NEXT: s_branch .LBB15_2
@@ -3454,8 +3458,8 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -3467,19 +3471,21 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB34_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB34_2
; SI-NEXT: .LBB34_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3492,12 +3498,12 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB34_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -3507,9 +3513,9 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB34_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -3630,20 +3636,22 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s18, 0
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: s_cbranch_scc0 .LBB35_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: s_cbranch_execnz .LBB35_3
; SI-NEXT: .LBB35_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: .LBB35_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB35_4:
@@ -3658,13 +3666,13 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB35_4
; VI-NEXT: .LBB35_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -3674,9 +3682,9 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB35_3:
; VI-NEXT: s_branch .LBB35_2
@@ -5356,12 +5364,14 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB50_2
; SI-NEXT: .LBB50_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5390,8 +5400,8 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB50_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5522,12 +5532,14 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT: s_cbranch_execnz .LBB51_3
; SI-NEXT: .LBB51_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT: .LBB51_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB51_4:
@@ -5560,8 +5572,8 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB51_3:
; VI-NEXT: s_branch .LBB51_2
@@ -7024,8 +7036,8 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB62_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -7196,8 +7208,8 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB63_3:
; VI-NEXT: s_branch .LBB63_2
@@ -8411,8 +8423,8 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -8424,19 +8436,21 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB72_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB72_2
; SI-NEXT: .LBB72_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8449,12 +8463,12 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB72_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -8464,9 +8478,9 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB72_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -8587,20 +8601,22 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s18, 0
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: s_cbranch_scc0 .LBB73_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: s_cbranch_execnz .LBB73_3
; SI-NEXT: .LBB73_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: .LBB73_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB73_4:
@@ -8615,13 +8631,13 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB73_4
; VI-NEXT: .LBB73_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -8631,9 +8647,9 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB73_3:
; VI-NEXT: s_branch .LBB73_2
@@ -8913,8 +8929,8 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -8929,21 +8945,25 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB76_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB76_2
; SI-NEXT: .LBB76_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -8972,27 +8992,28 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB76_2
; VI-NEXT: .LBB76_4: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_or_b32_e32 v1, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9163,22 +9184,26 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s18, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: s_cbranch_scc0 .LBB77_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; SI-NEXT: s_cbranch_execnz .LBB77_3
; SI-NEXT: .LBB77_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; SI-NEXT: .LBB77_3: ; %end
@@ -9196,44 +9221,45 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
; VI-NEXT: s_cmp_lg_u32 s17, 0
; VI-NEXT: s_cbranch_scc0 .LBB77_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s7, s16, 24
-; VI-NEXT: s_lshr_b32 s6, s16, 16
-; VI-NEXT: s_lshr_b32 s8, s16, 8
+; VI-NEXT: s_lshr_b32 s6, s16, 24
+; VI-NEXT: s_lshr_b32 s8, s16, 16
+; VI-NEXT: s_lshr_b32 s7, s16, 8
; VI-NEXT: s_cbranch_execnz .LBB77_4
; VI-NEXT: .LBB77_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_or_b32_e32 v1, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB77_3:
+; VI-NEXT: ; implicit-def: $sgpr7
; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: ; implicit-def: $sgpr6
-; VI-NEXT: ; implicit-def: $sgpr7
; VI-NEXT: s_branch .LBB77_2
; VI-NEXT: .LBB77_4:
-; VI-NEXT: v_mov_b32_e32 v1, s8
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 68312b89142c7..2db07ae9626a8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -88,8 +88,8 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB0_2: ; %end
@@ -309,8 +309,8 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -691,16 +691,16 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB4_2
; SI-NEXT: .LBB4_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -739,8 +739,8 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB4_2: ; %end
@@ -907,16 +907,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; SI-NEXT: s_cbranch_execnz .LBB5_3
; SI-NEXT: .LBB5_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: .LBB5_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -959,8 +959,8 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 5aac06a7f3a2b..bc0c0158aff29 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -5095,37 +5095,37 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
@@ -5135,170 +5135,202 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB22_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: .LBB22_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB22_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB22_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -5322,12 +5354,12 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB22_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -5338,15 +5370,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -5356,15 +5388,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -5374,15 +5406,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -5392,15 +5424,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -5410,15 +5442,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -5428,15 +5460,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -5446,15 +5478,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -5464,15 +5496,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -5482,15 +5514,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -5500,15 +5532,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -5518,15 +5550,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -5536,15 +5568,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -5554,15 +5586,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -5572,15 +5604,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -5590,15 +5622,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -5608,9 +5640,9 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB22_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -6468,171 +6500,208 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v40
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB23_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -6654,7 +6723,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -6662,7 +6731,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s31, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6670,7 +6739,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -6678,27 +6747,27 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6706,17 +6775,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s28, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6724,17 +6793,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s27, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6742,17 +6811,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6760,17 +6829,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s25, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6778,17 +6847,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s24, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6796,17 +6865,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6814,17 +6883,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6832,17 +6901,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6850,17 +6919,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6868,17 +6937,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -6886,17 +6955,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
@@ -6904,17 +6973,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
@@ -6922,15 +6991,15 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -6940,9 +7009,9 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB23_5
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -19915,37 +19984,37 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
@@ -19955,170 +20024,202 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB46_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: .LBB46_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB46_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB46_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -20142,12 +20243,12 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB46_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -20158,15 +20259,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -20176,15 +20277,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -20194,15 +20295,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -20212,15 +20313,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -20230,15 +20331,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -20248,15 +20349,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -20266,15 +20367,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -20284,15 +20385,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -20302,15 +20403,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -20320,15 +20421,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -20338,15 +20439,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -20356,15 +20457,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -20374,15 +20475,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -20392,15 +20493,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -20410,15 +20511,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -20428,9 +20529,9 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB46_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -21288,171 +21389,208 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v40
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB47_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -21474,7 +21612,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -21482,7 +21620,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s31, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21490,7 +21628,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -21498,27 +21636,27 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21526,17 +21664,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s28, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21544,17 +21682,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s27, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21562,17 +21700,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21580,17 +21718,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s25, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21598,17 +21736,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s24, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21616,17 +21754,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21634,17 +21772,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21652,17 +21790,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21670,17 +21808,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21688,17 +21826,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -21706,17 +21844,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
@@ -21724,17 +21862,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
@@ -21742,15 +21880,15 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -21760,9 +21898,9 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB47_5
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -34243,37 +34381,37 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
@@ -34283,170 +34421,202 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB66_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: .LBB66_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB66_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB66_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -34470,12 +34640,12 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB66_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -34486,15 +34656,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -34504,15 +34674,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -34522,15 +34692,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -34540,15 +34710,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -34558,15 +34728,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -34576,15 +34746,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -34594,15 +34764,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -34612,15 +34782,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -34630,15 +34800,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -34648,15 +34818,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -34666,15 +34836,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -34684,15 +34854,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -34702,15 +34872,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -34720,15 +34890,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -34738,15 +34908,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -34756,9 +34926,9 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB66_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -35616,171 +35786,208 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v40
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB67_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -35802,7 +36009,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -35810,7 +36017,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s31, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35818,7 +36025,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -35826,27 +36033,27 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35854,17 +36061,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s28, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35872,17 +36079,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s27, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35890,17 +36097,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35908,17 +36115,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s25, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35926,17 +36133,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s24, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35944,17 +36151,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35962,17 +36169,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35980,17 +36187,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -35998,17 +36205,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -36016,17 +36223,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -36034,17 +36241,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
@@ -36052,17 +36259,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
@@ -36070,15 +36277,15 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -36088,9 +36295,9 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB67_5
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -47646,37 +47853,37 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13
; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17
; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23
; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25
; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
@@ -47686,170 +47893,202 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB82_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
-; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr43
; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr51
; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr35
; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr33
; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: .LBB82_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB82_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB82_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -47873,12 +48112,12 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB82_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
@@ -47889,15 +48128,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
@@ -47907,15 +48146,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
@@ -47925,15 +48164,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
@@ -47943,15 +48182,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
@@ -47961,15 +48200,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
@@ -47979,15 +48218,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
@@ -47997,15 +48236,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
@@ -48015,15 +48254,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
@@ -48033,15 +48272,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
@@ -48051,15 +48290,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -48069,15 +48308,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -48087,15 +48326,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
@@ -48105,15 +48344,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -48123,15 +48362,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -48141,15 +48380,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16
-; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -48159,9 +48398,9 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB82_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -49019,171 +49258,208 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17
; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27
; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16
-; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16
-; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16
-; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16
-; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16
-; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16
-; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16
-; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16
-; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v40
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25
; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
; SI-NEXT: .LBB83_3: ; %end
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -49205,7 +49481,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s31, 16
+; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -49213,7 +49489,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s31, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49221,7 +49497,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: s_lshl_b32 s4, s30, 16
+; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -49229,27 +49505,27 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s30, 16
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_add_f32_e32 v4, s4, v0
; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16
+; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s29, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49257,17 +49533,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s28, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s28, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49275,17 +49551,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s27, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s27, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49293,17 +49569,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s26, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s26, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49311,17 +49587,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s25, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s25, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49329,17 +49605,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s24, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s24, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49347,17 +49623,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s23, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s23, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49365,17 +49641,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s22, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s22, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49383,17 +49659,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s21, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s21, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49401,17 +49677,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s20, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s20, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49419,17 +49695,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s19, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s19, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -49437,17 +49713,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
@@ -49455,17 +49731,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v16, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1
; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
@@ -49473,15 +49749,15 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v0
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -49491,9 +49767,9 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB83_5
; VI-NEXT: .LBB83_3:
; VI-NEXT: s_branch .LBB83_2
@@ -60284,7 +60560,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1
@@ -60316,7 +60592,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_mul_f32_e32 v52, 1.0, v27
; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28
; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v30
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -60351,7 +60627,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -60387,8 +60663,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v55
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54
; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr62
; SI-NEXT: ; implicit-def: $vgpr33
@@ -60419,116 +60695,132 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr41
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: .LBB94_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v8, v2, v9
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v12, v2, v13
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v16, v2, v17
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v20, v2, v21
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v24, v2, v25
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
-; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v28, v2, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v30, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v26, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v22, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v18, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34
-; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v32
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
@@ -60824,38 +61116,38 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16
-; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16
-; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16
-; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16
-; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16
-; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v28, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v27, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v24, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB94_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -61793,110 +62085,126 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60
-; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v4, v2, v5
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v8, v2, v9
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v12, v2, v13
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v16, v2, v17
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v20, v2, v21
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v24, v2, v25
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11
-; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v28, v2, v29
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52
-; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v30, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50
-; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v26, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38
-; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v22, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34
-; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16
+; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; SI-NEXT: v_or_b32_e32 v18, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
-; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_or_b32_e32 v14, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v10, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v6, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v32
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
@@ -62012,19 +62320,19 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_add_f32_e32 v5, s5, v1
; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
+; VI-NEXT: v_or_b32_sdwa v14, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
+; VI-NEXT: v_or_b32_sdwa v15, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -62037,8 +62345,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v13, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62055,8 +62363,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v12, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62073,8 +62381,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v11, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62091,8 +62399,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62109,8 +62417,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62127,8 +62435,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v8, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62145,8 +62453,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v7, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62163,8 +62471,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62181,8 +62489,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62199,8 +62507,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -62217,8 +62525,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
@@ -62235,8 +62543,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v1
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
@@ -62260,10 +62568,10 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB95_5
; VI-NEXT: .LBB95_3:
; VI-NEXT: s_branch .LBB95_2
@@ -73035,38 +73343,38 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16
-; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16
-; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16
-; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16
-; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16
-; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16
-; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v28, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v27, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v24, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB102_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -74251,19 +74559,19 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_add_f32_e32 v5, s5, v1
; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
; VI-NEXT: s_lshl_b32 s4, s29, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16
+; VI-NEXT: v_or_b32_sdwa v14, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16
+; VI-NEXT: v_or_b32_sdwa v15, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: s_and_b32 s4, s29, 0xffff0000
@@ -74276,8 +74584,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s28, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v13, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74294,8 +74602,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s27, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v12, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74312,8 +74620,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s26, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v11, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74330,8 +74638,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s25, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74348,8 +74656,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s24, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74366,8 +74674,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s23, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v8, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74384,8 +74692,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s22, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v7, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74402,8 +74710,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s21, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74420,8 +74728,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: s_lshl_b32 s4, s20, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74438,8 +74746,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
; VI-NEXT: s_lshl_b32 s4, s19, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -74456,8 +74764,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v1
; VI-NEXT: v_bfe_u32 v16, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2
@@ -74474,8 +74782,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v16, s4, v1
; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
@@ -74499,10 +74807,10 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_branch .LBB103_5
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -83728,20 +84036,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
@@ -83789,604 +84087,593 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v42, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v7
; SI-NEXT: v_mul_f32_e32 v32, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v52, 1.0, v15
; SI-NEXT: v_mul_f32_e32 v34, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22
; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr47
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr63
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr58
; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr46
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr59
; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; kill: killed $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr30
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr45
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; SI-NEXT: v_alignbit_b32 v48, v1, v38, 16
-; SI-NEXT: v_alignbit_b32 v50, v37, v35, 16
-; SI-NEXT: v_alignbit_b32 v1, v50, v48, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v16, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31
+; SI-NEXT: v_or_b32_e32 v19, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v19, v16, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v50, v48, 16
+; SI-NEXT: v_alignbit_b32 v1, v19, v16, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v50, v48, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32
-; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16
-; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16
-; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v19, v16, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33
-; SI-NEXT: v_alignbit_b32 v17, v1, v55, 16
-; SI-NEXT: v_alignbit_b32 v18, v16, v53, 16
-; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
+; SI-NEXT: v_or_b32_e32 v13, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
+; SI-NEXT: v_or_b32_e32 v14, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v34
-; SI-NEXT: v_alignbit_b32 v14, v1, v42, 16
-; SI-NEXT: v_alignbit_b32 v15, v13, v40, 16
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT: v_or_b32_e32 v11, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33
+; SI-NEXT: v_or_b32_e32 v12, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT: v_alignbit_b32 v11, v1, v45, 16
-; SI-NEXT: v_alignbit_b32 v12, v10, v43, 16
-; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
+; SI-NEXT: v_or_b32_e32 v9, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34
+; SI-NEXT: v_or_b32_e32 v10, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22
-; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16
-; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16
-; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
+; SI-NEXT: v_or_b32_e32 v7, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v8, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25
-; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16
-; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41
+; SI-NEXT: v_or_b32_e32 v5, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v6, v1, v2
; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22
-; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16
-; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v3, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31
-; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21
-; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25
-; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v26
+; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v2, v2, v15
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 24
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v20
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 16
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 8
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v26
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v10
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26
+; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v19
+; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v14
+; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v12
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v31
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v32
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v33
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v34
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v22
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
+; SI-NEXT: ; implicit-def: $vgpr37
; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr38
; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr50
; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr54
; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr52
; SI-NEXT: ; implicit-def: $vgpr34
; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr21
; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr20
; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr26
; SI-NEXT: .LBB108_2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_4
; SI-NEXT: ; %bb.3: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v51
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32
-; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36
-; SI-NEXT: v_alignbit_b32 v21, v19, v20, 16
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_alignbit_b32 v48, v30, v20, 16
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v55
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v50, v37, v20, 16
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_alignbit_b32 v20, v50, v48, 24
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v50, v48, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v41
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v18
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v50, v48, 8
-; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v53
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v50
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v49
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v39
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44
-; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34
-; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v37
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v16, v15, v16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v31
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v24
+; SI-NEXT: v_or_b32_e32 v19, v15, v19
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39
-; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8
-; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v15, v14, 24
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v24
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
-; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v19, v16, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25
-; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v19, v16, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v19, v16, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v14, v13, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16
-; SI-NEXT: v_alignbit_b32 v6, v4, v6, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v14, v13, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v9, v8, 8
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v14, v13, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v12, v11, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16
-; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v12, v11, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v6, v5, 8
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v12, v11, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v10, v9, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v10, v9, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v10, v9, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v8, v7, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v6, v5, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6
-; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50
-; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30
-; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29
-; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v28
-; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25
-; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v15, v6, v5, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v6, v5, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v4, v3, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v4, v3, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v4, v3, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 24
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 16
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v15, v2, v1, 8
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v10
+; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v20
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v19
+; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v14
+; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v12
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v24
+; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v23
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v22
+; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v18
+; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v21
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v26
+; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: .LBB108_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v48
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20
-; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15
; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24
-; SI-NEXT: v_or_b32_e32 v22, v22, v24
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_or_b32_e32 v24, v25, v24
-; SI-NEXT: v_or_b32_e32 v22, v22, v24
-; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v50
-; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63
-; SI-NEXT: v_or_b32_e32 v22, v22, v24
-; SI-NEXT: v_and_b32_e32 v24, 0xff, v37
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT: v_or_b32_e32 v20, v20, v24
-; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; SI-NEXT: v_or_b32_e32 v20, v20, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v22, v23, v22
-; SI-NEXT: v_or_b32_e32 v20, v20, v22
-; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v20, 0xff, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v59
-; SI-NEXT: v_or_b32_e32 v20, v20, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62
-; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: v_or_b32_e32 v19, v20, v19
-; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19
-; SI-NEXT: v_or_b32_e32 v17, v17, v19
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_or_b32_e32 v19, v20, v19
-; SI-NEXT: v_or_b32_e32 v17, v17, v19
-; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_or_b32_e32 v17, v18, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; SI-NEXT: v_or_b32_e32 v17, v17, v18
-; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v60
-; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT: v_or_b32_e32 v16, v18, v16
-; SI-NEXT: v_or_b32_e32 v16, v17, v16
-; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0
-; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v19
+; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v60
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: v_and_b32_e32 v17, 0xff, v47
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v17
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; SI-NEXT: v_or_b32_e32 v14, v14, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_or_b32_e32 v16, v17, v16
-; SI-NEXT: v_or_b32_e32 v14, v14, v16
-; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v14, 0xff, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT: v_or_b32_e32 v14, v14, v15
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v56
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15
-; SI-NEXT: v_or_b32_e32 v13, v15, v13
-; SI-NEXT: v_or_b32_e32 v13, v14, v13
-; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0
+; SI-NEXT: v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v15, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0
; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
@@ -84395,58 +84682,91 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_or_b32_e32 v13, v14, v13
; SI-NEXT: v_or_b32_e32 v11, v11, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0
+; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0
; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v11, 0xff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61
+; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v44
; SI-NEXT: v_or_b32_e32 v11, v11, v12
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v30
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v61
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
-; SI-NEXT: v_or_b32_e32 v10, v12, v10
-; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v12, v13, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT: v_or_b32_e32 v8, v8, v10
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v58
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_or_b32_e32 v8, v8, v10
-; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0
+; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v8, 0xff, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT: v_or_b32_e32 v8, v8, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT: v_or_b32_e32 v7, v9, v7
-; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v10, v9
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
+; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0
+; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v62
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v46
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v9, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0
; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; SI-NEXT: v_or_b32_e32 v5, v5, v7
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8
@@ -84455,52 +84775,89 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
; SI-NEXT: v_or_b32_e32 v5, v5, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0
+; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0
; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v59
; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v43
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_or_b32_e32 v4, v6, v4
-; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v7, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0
+; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v57
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
@@ -84526,6 +84883,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr17
; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -84542,49 +84903,47 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr29
-; VI-NEXT: ; implicit-def: $vgpr27
-; VI-NEXT: ; implicit-def: $vgpr22
-; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr44
; VI-NEXT: ; implicit-def: $vgpr63
; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr53
; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr50
; VI-NEXT: ; implicit-def: $vgpr46
; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr49
; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr38
; VI-NEXT: ; implicit-def: $vgpr52
; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; implicit-def: $vgpr30
; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr28
; VI-NEXT: ; implicit-def: $vgpr26
-; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr23
; VI-NEXT: ; implicit-def: $vgpr20
; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: ; implicit-def: $vgpr18
@@ -84593,71 +84952,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB108_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15
; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16
; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
-; VI-NEXT: v_mov_b32_e32 v26, v22
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v16
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16
; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v1
+; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; VI-NEXT: .LBB108_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB108_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v36, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
@@ -84665,443 +85025,465 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v36
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; VI-NEXT: v_or_b32_e32 v29, v2, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v37, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; VI-NEXT: v_or_b32_e32 v28, v1, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v34, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v4
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; VI-NEXT: v_or_b32_e32 v31, v4, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v35, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v35
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_alignbit_b32 v3, v3, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; VI-NEXT: v_or_b32_e32 v30, v3, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v59, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v6, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v6
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v6
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; VI-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v59
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_alignbit_b32 v6, v6, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; VI-NEXT: v_or_b32_e32 v26, v6, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v61, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v5, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v5
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v5
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v61
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v5, v5, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; VI-NEXT: v_or_b32_e32 v25, v5, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v46, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v8, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v8
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v8
; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; VI-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v46
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: v_alignbit_b32 v8, v8, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; VI-NEXT: v_or_b32_e32 v23, v8, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v56, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v7, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v7
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; VI-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v56
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: v_alignbit_b32 v7, v7, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v10
+; VI-NEXT: v_or_b32_e32 v22, v7, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v41, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v10, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v10
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v10
; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; VI-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v41
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; VI-NEXT: v_or_b32_e32 v33, v10, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v9, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v9
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v9
; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; VI-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v43
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v12
+; VI-NEXT: v_or_b32_e32 v32, v9, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v12
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v52, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v12, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v12
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12
; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; VI-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v52
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; VI-NEXT: v_or_b32_e32 v21, v12, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v11
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v54, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v11, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v11
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; VI-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; VI-NEXT: v_or_b32_e32 v20, v11, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v51, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v14, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v14
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v14
; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; VI-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v51
; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v13
+; VI-NEXT: v_or_b32_e32 v39, v14, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v40, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v13, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; VI-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v40
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; VI-NEXT: v_or_b32_e32 v38, v13, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v45, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; VI-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v45
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: v_alignbit_b32 v16, v16, v17, 16
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_or_b32_e32 v49, v16, v17
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
-; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_cndmask_b32_e32 v58, v18, v19, vcc
; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
+; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v58
; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v48, v15, v17
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[48:49]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[38:39]
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v49
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v48
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v39
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v38
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v20
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21]
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v21
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33]
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[22:23]
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v22
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v26
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[25:26]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[30:31]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[28:29]
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v33
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v32
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v31
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v30
+; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v29
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v28
+; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v45
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v45
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v58
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v51
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v51
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v40
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v52
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v41
+; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v41
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v43
+; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v46
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v46
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v56
+; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v59
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v61
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v34
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v34
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v35
+; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v36
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v37
; VI-NEXT: .LBB108_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29
-; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22
-; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v57
+; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v24
+; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
-; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47
+; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26
; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58
; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23
; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v51
; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31
+; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28
+; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -86891,596 +87273,651 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9
; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11
+; SI-NEXT: v_mul_f32_e32 v30, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v28, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v22, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16
-; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16
-; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v15, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; SI-NEXT: v_or_b32_e32 v16, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v30, v27, 16
+; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v13, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
+; SI-NEXT: v_or_b32_e32 v14, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41
-; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16
-; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16
-; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8
+; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26
-; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16
-; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16
-; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v11, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22
+; SI-NEXT: v_or_b32_e32 v12, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42
-; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16
-; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16
-; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v9, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48
+; SI-NEXT: v_or_b32_e32 v10, v1, v2
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8
+; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT: v_or_b32_e32 v7, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
+; SI-NEXT: v_or_b32_e32 v8, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; SI-NEXT: v_or_b32_e32 v5, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44
+; SI-NEXT: v_or_b32_e32 v6, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56
-; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16
-; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16
-; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48
-; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16
-; SI-NEXT: v_mov_b32_e32 v31, v23
-; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16
-; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16
-; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24
-; SI-NEXT: v_mov_b32_e32 v38, v36
-; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24
-; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24
-; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16
-; SI-NEXT: v_mov_b32_e32 v53, v32
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16
-; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8
-; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16
-; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8
-; SI-NEXT: v_mov_b32_e32 v35, v29
-; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16
-; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT: v_mov_b32_e32 v37, v33
-; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22
-; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v23, v41
-; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19
-; SI-NEXT: v_mov_b32_e32 v28, v26
-; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26
-; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16
-; SI-NEXT: v_mov_b32_e32 v26, v42
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT: v_mov_b32_e32 v29, v43
-; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20
-; SI-NEXT: v_mov_b32_e32 v34, v44
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
+; SI-NEXT: v_or_b32_e32 v3, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v36
+; SI-NEXT: v_or_b32_e32 v2, v2, v17
+; SI-NEXT: v_alignbit_b32 v17, v10, v9, 8
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v17, v4, v3, 24
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v17, v2, v1, 24
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_alignbit_b32 v17, v2, v1, 8
+; SI-NEXT: v_mov_b32_e32 v33, v31
+; SI-NEXT: v_mov_b32_e32 v32, v30
+; SI-NEXT: v_mov_b32_e32 v31, v29
+; SI-NEXT: v_mov_b32_e32 v39, v38
+; SI-NEXT: v_mov_b32_e32 v38, v37
+; SI-NEXT: v_mov_b32_e32 v18, v34
+; SI-NEXT: v_alignbit_b32 v51, v8, v7, 24
+; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16
+; SI-NEXT: v_alignbit_b32 v55, v8, v7, 8
+; SI-NEXT: v_alignbit_b32 v34, v6, v5, 24
+; SI-NEXT: v_alignbit_b32 v26, v6, v5, 16
+; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8
+; SI-NEXT: v_alignbit_b32 v27, v4, v3, 8
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v16
+; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v14
+; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12
+; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v10
+; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v2
+; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v28
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v35
+; SI-NEXT: v_mov_b32_e32 v21, v35
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v35
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v22
+; SI-NEXT: v_mov_b32_e32 v23, v22
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v48
+; SI-NEXT: v_mov_b32_e32 v22, v48
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v48
+; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v40
+; SI-NEXT: v_mov_b32_e32 v24, v40
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v40
; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44
-; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14
-; SI-NEXT: v_mov_b32_e32 v33, v56
-; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48
-; SI-NEXT: v_mov_b32_e32 v48, v32
-; SI-NEXT: v_mov_b32_e32 v32, v50
-; SI-NEXT: v_mov_b32_e32 v50, v25
-; SI-NEXT: v_mov_b32_e32 v25, v36
-; SI-NEXT: v_mov_b32_e32 v36, v38
-; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5
+; SI-NEXT: v_mov_b32_e32 v25, v44
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v54
+; SI-NEXT: v_mov_b32_e32 v30, v54
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v54
+; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v36
+; SI-NEXT: v_mov_b32_e32 v37, v36
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v36
+; SI-NEXT: v_mov_b32_e32 v36, v20
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42
-; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45
-; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8
-; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43
-; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16
-; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v31
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v43
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v11
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v15
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v41
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v43
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v54
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v52
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18
+; SI-NEXT: v_alignbit_b32 v27, v4, v3, 8
+; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v17
+; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v44
+; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41
+; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v43
+; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v54
+; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v52
+; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v18
+; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v2
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24
-; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16
-; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8
-; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17
-; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56
-; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41
-; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56
-; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8
-; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16
-; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16
-; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19
-; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24
-; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16
-; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v34, v6, v5, 24
+; SI-NEXT: v_alignbit_b32 v26, v6, v5, 16
+; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8
+; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v51, v8, v7, 24
+; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16
+; SI-NEXT: v_alignbit_b32 v55, v8, v7, 8
+; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v8
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_alignbit_b32 v22, v10, v9, 24
+; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v10
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59
-; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16
-; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24
-; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_or_b32_e32 v16, v16, v20
+; SI-NEXT: v_alignbit_b32 v20, v16, v15, 24
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v16, v15, 8
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v14, v13, 16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v14, v13, 8
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47
-; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16
+; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24
-; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11
-; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47
-; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v10, v9, 8
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v4, v3, 24
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v4, v3, 16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT: v_alignbit_b32 v20, v2, v1, 24
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59
-; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45
-; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44
-; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5
+; SI-NEXT: v_alignbit_b32 v20, v2, v1, 8
+; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v16
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; SI-NEXT: .LBB109_3: ; %end
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v27, 0xff, v27
-; SI-NEXT: v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22
-; SI-NEXT: v_or_b32_e32 v22, v22, v24
-; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0
-; SI-NEXT: v_and_b32_e32 v21, 0xff, v21
-; SI-NEXT: v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
; SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v27, v27, v36
-; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v35, 0xff, v23
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23
-; SI-NEXT: v_or_b32_e32 v33, v33, v35
-; SI-NEXT: v_or_b32_e32 v27, v27, v33
-; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v27, 0xff, v30
-; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62
-; SI-NEXT: v_or_b32_e32 v27, v27, v30
-; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT: v_or_b32_e32 v22, v27, v22
-; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22
-; SI-NEXT: v_or_b32_e32 v21, v21, v22
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_or_b32_e32 v22, v24, v22
-; SI-NEXT: v_or_b32_e32 v21, v21, v22
-; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46
-; SI-NEXT: v_or_b32_e32 v19, v19, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41
-; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT: v_or_b32_e32 v17, v21, v17
-; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; SI-NEXT: v_or_b32_e32 v15, v15, v17
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
+; SI-NEXT: v_or_b32_e32 v15, v15, v18
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19
+; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: v_or_b32_e32 v15, v15, v17
-; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v23, v18
+; SI-NEXT: v_or_b32_e32 v15, v15, v18
+; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v15, 0xff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57
+; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v50
; SI-NEXT: v_or_b32_e32 v15, v15, v16
-; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61
+; SI-NEXT: v_and_b32_e32 v16, 0xff, v62
+; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v19
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT: v_or_b32_e32 v13, v16, v13
-; SI-NEXT: v_or_b32_e32 v13, v15, v13
-; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0
+; SI-NEXT: v_or_b32_e32 v16, v18, v16
+; SI-NEXT: v_or_b32_e32 v15, v15, v16
+; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v15, v16, v15
+; SI-NEXT: v_or_b32_e32 v13, v13, v15
+; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0
; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v13, 0xff, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v47
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_and_b32_e32 v14, 0xff, v63
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT: v_or_b32_e32 v14, v15, v14
+; SI-NEXT: v_or_b32_e32 v13, v13, v14
+; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; SI-NEXT: v_or_b32_e32 v10, v10, v13
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT: v_or_b32_e32 v11, v11, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0xff, v13
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_or_b32_e32 v13, v15, v13
-; SI-NEXT: v_or_b32_e32 v10, v10, v13
-; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0
-; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v13, v14, v13
+; SI-NEXT: v_or_b32_e32 v11, v11, v13
+; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0
+; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58
-; SI-NEXT: v_or_b32_e32 v10, v10, v11
-; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63
-; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT: v_or_b32_e32 v9, v11, v9
-; SI-NEXT: v_or_b32_e32 v9, v10, v9
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v56
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_and_b32_e32 v12, 0xff, v58
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v61
+; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT: v_or_b32_e32 v12, v13, v12
+; SI-NEXT: v_or_b32_e32 v11, v11, v12
+; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0
+; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_or_b32_e32 v11, v12, v11
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
+; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0
+; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v29
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
+; SI-NEXT: v_and_b32_e32 v10, 0xff, v42
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v41
+; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT: v_or_b32_e32 v10, v11, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0
; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v55
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v55
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
+; SI-NEXT: v_and_b32_e32 v9, 0xff, v53
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v51
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
+; SI-NEXT: v_or_b32_e32 v7, v7, v9
; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v20
-; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_and_b32_e32 v9, 0xff, v18
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_or_b32_e32 v6, v6, v9
-; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v8
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v52
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v40
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v57
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_or_b32_e32 v8, v9, v8
+; SI-NEXT: v_or_b32_e32 v7, v7, v8
+; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0
+; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v52
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v6, v9, v6
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0
-; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v49
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v26
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v34
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_or_b32_e32 v7, v8, v7
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0
+; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_and_b32_e32 v6, 0xff, v12
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v48
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v6, v9, v6
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
+; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v45
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_or_b32_e32 v6, v7, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0
-; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v5, v6, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0
+; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v8
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xff, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v35
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v60
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v4, v5, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0
+; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xff, v36
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v5
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xff, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v28
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42
+; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v44
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -87507,813 +87944,866 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: .LBB109_4:
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: v_mov_b32_e32 v53, v32
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: v_mov_b32_e32 v39, v38
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v49, v48
+; SI-NEXT: v_mov_b32_e32 v38, v37
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v37, v33
+; SI-NEXT: v_mov_b32_e32 v37, v36
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v33, v56
+; SI-NEXT: v_mov_b32_e32 v18, v34
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v35, v29
+; SI-NEXT: v_mov_b32_e32 v33, v31
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v34, v44
+; SI-NEXT: v_mov_b32_e32 v32, v30
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v31, v23
+; SI-NEXT: v_mov_b32_e32 v31, v29
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v29, v43
+; SI-NEXT: v_mov_b32_e32 v30, v54
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v28, v26
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v25, v44
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v26, v42
+; SI-NEXT: v_mov_b32_e32 v24, v40
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v23, v41
+; SI-NEXT: v_mov_b32_e32 v23, v22
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v22, v48
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr62
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: v_mov_b32_e32 v21, v35
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr15
; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr62
+; SI-NEXT: ; implicit-def: $vgpr19
; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr63
+; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr56
; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr61
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr63
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr53
; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; kill: killed $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr57
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr60
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr44
; SI-NEXT: s_branch .LBB109_2
;
; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v63, s30, 0
-; VI-NEXT: v_writelane_b32 v63, s31, 1
-; VI-NEXT: v_writelane_b32 v63, s34, 2
-; VI-NEXT: v_writelane_b32 v63, s35, 3
-; VI-NEXT: v_writelane_b32 v63, s36, 4
-; VI-NEXT: v_writelane_b32 v63, s37, 5
-; VI-NEXT: v_writelane_b32 v63, s38, 6
-; VI-NEXT: v_writelane_b32 v63, s39, 7
-; VI-NEXT: v_writelane_b32 v63, s48, 8
-; VI-NEXT: v_writelane_b32 v63, s49, 9
-; VI-NEXT: v_writelane_b32 v63, s50, 10
-; VI-NEXT: v_writelane_b32 v63, s51, 11
-; VI-NEXT: v_writelane_b32 v63, s52, 12
-; VI-NEXT: v_writelane_b32 v63, s53, 13
-; VI-NEXT: v_writelane_b32 v63, s54, 14
-; VI-NEXT: v_writelane_b32 v63, s55, 15
-; VI-NEXT: v_writelane_b32 v63, s64, 16
-; VI-NEXT: v_writelane_b32 v63, s65, 17
-; VI-NEXT: v_writelane_b32 v63, s66, 18
+; VI-NEXT: v_writelane_b32 v4, s30, 0
+; VI-NEXT: v_writelane_b32 v4, s31, 1
+; VI-NEXT: v_writelane_b32 v4, s34, 2
+; VI-NEXT: v_writelane_b32 v4, s35, 3
+; VI-NEXT: v_writelane_b32 v4, s36, 4
+; VI-NEXT: v_writelane_b32 v4, s37, 5
+; VI-NEXT: v_writelane_b32 v4, s38, 6
+; VI-NEXT: v_writelane_b32 v4, s39, 7
+; VI-NEXT: v_writelane_b32 v4, s48, 8
+; VI-NEXT: v_writelane_b32 v4, s49, 9
+; VI-NEXT: v_writelane_b32 v4, s50, 10
+; VI-NEXT: v_writelane_b32 v4, s51, 11
+; VI-NEXT: v_writelane_b32 v4, s52, 12
+; VI-NEXT: v_writelane_b32 v4, s53, 13
+; VI-NEXT: v_writelane_b32 v4, s54, 14
+; VI-NEXT: v_writelane_b32 v4, s55, 15
+; VI-NEXT: v_writelane_b32 v4, s64, 16
+; VI-NEXT: v_writelane_b32 v4, s65, 17
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_writelane_b32 v63, s67, 19
+; VI-NEXT: v_writelane_b32 v4, s66, 18
; VI-NEXT: v_readfirstlane_b32 s4, v1
; VI-NEXT: s_and_b64 s[6:7], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v2
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_cbranch_scc0 .LBB109_3
+; VI-NEXT: v_writelane_b32 v4, s67, 19
+; VI-NEXT: s_cbranch_scc0 .LBB109_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b32 s56, s5, 24
; VI-NEXT: s_lshr_b32 s57, s5, 16
-; VI-NEXT: s_lshr_b32 s59, s5, 8
+; VI-NEXT: s_lshr_b32 s76, s5, 8
; VI-NEXT: s_lshr_b32 s58, s4, 16
-; VI-NEXT: s_lshr_b32 s60, s4, 8
-; VI-NEXT: s_lshr_b32 s61, s29, 24
-; VI-NEXT: s_lshr_b32 s62, s29, 16
-; VI-NEXT: s_lshr_b32 s72, s29, 8
-; VI-NEXT: s_lshr_b32 s63, s28, 16
-; VI-NEXT: s_lshr_b32 s73, s28, 8
-; VI-NEXT: s_lshr_b32 s74, s27, 24
-; VI-NEXT: s_lshr_b32 s75, s27, 16
-; VI-NEXT: s_lshr_b32 s77, s27, 8
-; VI-NEXT: s_lshr_b32 s76, s26, 16
-; VI-NEXT: s_lshr_b32 s78, s26, 8
-; VI-NEXT: s_lshr_b32 s79, s25, 24
-; VI-NEXT: s_lshr_b32 s88, s25, 16
-; VI-NEXT: s_lshr_b32 s90, s25, 8
-; VI-NEXT: s_lshr_b32 s89, s24, 16
-; VI-NEXT: s_lshr_b32 s91, s24, 8
-; VI-NEXT: s_lshr_b32 s30, s23, 24
-; VI-NEXT: s_lshr_b32 s31, s23, 16
-; VI-NEXT: s_lshr_b32 s35, s23, 8
-; VI-NEXT: s_lshr_b32 s34, s22, 16
-; VI-NEXT: s_lshr_b32 s36, s22, 8
-; VI-NEXT: s_lshr_b32 s37, s21, 24
-; VI-NEXT: s_lshr_b32 s38, s21, 16
-; VI-NEXT: s_lshr_b32 s48, s21, 8
-; VI-NEXT: s_lshr_b32 s39, s20, 16
-; VI-NEXT: s_lshr_b32 s49, s20, 8
-; VI-NEXT: s_lshr_b32 s50, s19, 24
-; VI-NEXT: s_lshr_b32 s51, s19, 16
-; VI-NEXT: s_lshr_b32 s53, s19, 8
-; VI-NEXT: s_lshr_b32 s52, s18, 16
-; VI-NEXT: s_lshr_b32 s54, s18, 8
-; VI-NEXT: s_lshr_b32 s55, s17, 24
-; VI-NEXT: s_lshr_b32 s64, s17, 16
-; VI-NEXT: s_lshr_b32 s66, s17, 8
-; VI-NEXT: s_lshr_b32 s65, s16, 16
-; VI-NEXT: s_lshr_b32 s67, s16, 8
-; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24
-; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24
-; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24
-; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24
-; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24
-; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24
-; VI-NEXT: s_cbranch_execnz .LBB109_4
+; VI-NEXT: s_lshr_b32 s77, s4, 8
+; VI-NEXT: s_lshr_b32 s59, s29, 24
+; VI-NEXT: s_lshr_b32 s60, s29, 16
+; VI-NEXT: s_lshr_b32 s75, s29, 8
+; VI-NEXT: s_lshr_b32 s61, s28, 16
+; VI-NEXT: s_lshr_b32 s74, s28, 8
+; VI-NEXT: s_lshr_b32 s62, s27, 24
+; VI-NEXT: s_lshr_b32 s78, s27, 16
+; VI-NEXT: s_lshr_b32 s73, s27, 8
+; VI-NEXT: s_lshr_b32 s88, s26, 16
+; VI-NEXT: s_lshr_b32 s72, s26, 8
+; VI-NEXT: s_lshr_b32 s90, s25, 24
+; VI-NEXT: s_lshr_b32 s91, s25, 16
+; VI-NEXT: s_lshr_b32 s63, s25, 8
+; VI-NEXT: s_lshr_b32 s31, s24, 16
+; VI-NEXT: s_lshr_b32 s79, s24, 8
+; VI-NEXT: s_lshr_b32 s35, s23, 24
+; VI-NEXT: s_lshr_b32 s37, s23, 16
+; VI-NEXT: s_lshr_b32 s89, s23, 8
+; VI-NEXT: s_lshr_b32 s39, s22, 16
+; VI-NEXT: s_lshr_b32 s30, s22, 8
+; VI-NEXT: s_lshr_b32 s49, s21, 24
+; VI-NEXT: s_lshr_b32 s50, s21, 16
+; VI-NEXT: s_lshr_b32 s34, s21, 8
+; VI-NEXT: s_lshr_b32 s52, s20, 16
+; VI-NEXT: s_lshr_b32 s36, s20, 8
+; VI-NEXT: s_lshr_b32 s54, s19, 24
+; VI-NEXT: s_lshr_b32 s55, s19, 16
+; VI-NEXT: s_lshr_b32 s38, s19, 8
+; VI-NEXT: s_lshr_b32 s64, s18, 16
+; VI-NEXT: s_lshr_b32 s48, s18, 8
+; VI-NEXT: s_lshr_b32 s66, s17, 24
+; VI-NEXT: s_lshr_b32 s65, s17, 16
+; VI-NEXT: s_lshr_b32 s51, s17, 8
+; VI-NEXT: s_lshr_b32 s67, s16, 16
+; VI-NEXT: s_lshr_b32 s53, s16, 8
+; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
+; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24
+; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24
+; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
+; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24
+; VI-NEXT: s_cbranch_execnz .LBB109_3
; VI-NEXT: .LBB109_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s6, s17, 16
-; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s6, v15
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s6, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s6, v15
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s6, s16, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s6, v15
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s65, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s17, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s65, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s17, s6, 16
; VI-NEXT: s_and_b32 s6, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s6, s19, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s47, s17, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s67, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s16, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s67, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s16, s6, 16
; VI-NEXT: s_and_b32 s6, s19, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_add_f32_e32 v4, s6, v15
-; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; VI-NEXT: s_lshl_b32 s6, s18, 16
-; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16
-; VI-NEXT: v_add_f32_e32 v3, s6, v15
-; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s46, s16, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s55, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s19, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s55, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s19, s6, 16
; VI-NEXT: s_and_b32 s6, s18, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: s_lshl_b32 s6, s21, 16
-; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v6, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s57, s19, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s64, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s18, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s64, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s18, s6, 16
; VI-NEXT: s_and_b32 s6, s21, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; VI-NEXT: v_add_f32_e32 v6, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v6, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; VI-NEXT: s_lshl_b32 s6, s20, 16
-; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16
-; VI-NEXT: v_add_f32_e32 v5, s6, v15
-; VI-NEXT: v_bfe_u32 v7, v5, 16, 1
-; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5
-; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s56, s18, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s50, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s21, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s50, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s21, s6, 16
; VI-NEXT: s_and_b32 s6, s20, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; VI-NEXT: s_lshl_b32 s6, s23, 16
-; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v8, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s59, s21, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s52, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s20, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s52, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s20, s6, 16
; VI-NEXT: s_and_b32 s6, s23, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
-; VI-NEXT: v_add_f32_e32 v8, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; VI-NEXT: s_lshl_b32 s6, s22, 16
-; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16
-; VI-NEXT: v_add_f32_e32 v7, s6, v15
-; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
-; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
-; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s58, s20, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s37, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s23, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s37, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s23, s6, 16
; VI-NEXT: s_and_b32 s6, s22, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: s_lshl_b32 s6, s25, 16
-; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9
-; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s61, s23, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s39, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s22, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s39, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s22, s6, 16
; VI-NEXT: s_and_b32 s6, s25, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
-; VI-NEXT: v_add_f32_e32 v10, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: s_lshl_b32 s6, s24, 16
-; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16
-; VI-NEXT: v_add_f32_e32 v9, s6, v15
-; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
-; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
-; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s60, s22, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s91, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s25, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s91, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s25, s6, 16
; VI-NEXT: s_and_b32 s6, s24, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: s_lshl_b32 s6, s27, 16
-; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v12, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11
-; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12
-; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s63, s25, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s31, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s24, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s31, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s24, s6, 16
; VI-NEXT: s_and_b32 s6, s27, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
-; VI-NEXT: v_add_f32_e32 v12, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: s_lshl_b32 s6, s26, 16
-; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16
-; VI-NEXT: v_add_f32_e32 v11, s6, v15
-; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
-; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
-; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s62, s24, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s78, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s27, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s78, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s27, s6, 16
; VI-NEXT: s_and_b32 s6, s26, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: s_lshl_b32 s6, s29, 16
-; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v14, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13
-; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14
-; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s73, s27, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s88, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s26, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s88, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s26, s6, 16
; VI-NEXT: s_and_b32 s6, s29, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc
-; VI-NEXT: v_add_f32_e32 v14, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: s_lshl_b32 s6, s28, 16
-; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16
-; VI-NEXT: v_add_f32_e32 v13, s6, v15
-; VI-NEXT: v_bfe_u32 v16, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13
-; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s72, s26, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s90, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s29, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s90, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s29, s6, 16
; VI-NEXT: s_and_b32 s6, s28, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT: s_lshl_b32 s6, s5, 16
-; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16
-; VI-NEXT: v_add_f32_e32 v16, s6, v15
-; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT: s_and_b32 s5, s5, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; VI-NEXT: s_lshl_b32 s5, s4, 16
-; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; VI-NEXT: v_add_f32_e32 v17, s5, v15
-; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: s_and_b32 s4, s4, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT: v_add_f32_e32 v15, s4, v15
-; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
-; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
-; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
-; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
-; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
-; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12
-; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11
-; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
-; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10
-; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9
-; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8
-; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
-; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7
-; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
-; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6
-; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6
-; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5
-; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5
-; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4
-; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
-; VI-NEXT: s_branch .LBB109_5
-; VI-NEXT: .LBB109_3:
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s75, s29, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s49, s9, s8
+; VI-NEXT: s_lshl_b32 s6, s28, 16
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_and_b32 s8, s49, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s7, 0x7fff
+; VI-NEXT: s_or_b32 s10, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s6, s10, s9
+; VI-NEXT: s_lshr_b32 s28, s6, 16
+; VI-NEXT: s_and_b32 s6, s5, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s74, s28, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s54, s9, s8
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_f32_e32 v2, s5, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v2
+; VI-NEXT: s_bfe_u32 s6, s5, 0x10010
+; VI-NEXT: s_add_i32 s6, s6, s5
+; VI-NEXT: s_and_b32 s8, s54, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s6, 0x7fff
+; VI-NEXT: s_bitset1_b32 s5, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s5, s5, s9
+; VI-NEXT: s_and_b32 s6, s4, 0xffff0000
+; VI-NEXT: v_add_f32_e32 v2, s6, v1
+; VI-NEXT: v_readfirstlane_b32 s6, v2
+; VI-NEXT: s_bfe_u32 s7, s6, 0x10010
+; VI-NEXT: s_lshr_b32 s5, s5, 16
+; VI-NEXT: s_add_i32 s7, s7, s6
+; VI-NEXT: s_or_b32 s35, s5, s8
+; VI-NEXT: s_add_i32 s8, s7, 0x7fff
+; VI-NEXT: s_or_b32 s9, s6, 0x400000
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s66, s9, s8
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: s_bfe_u32 s6, s4, 0x10010
+; VI-NEXT: s_add_i32 s6, s6, s4
+; VI-NEXT: s_and_b32 s8, s66, 0xffff0000
+; VI-NEXT: s_add_i32 s9, s6, 0x7fff
+; VI-NEXT: s_bitset1_b32 s4, 22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_and_b64 s[6:7], vcc, exec
+; VI-NEXT: s_cselect_b32 s4, s4, s9
+; VI-NEXT: s_lshr_b32 s4, s4, 16
+; VI-NEXT: s_or_b32 s34, s4, s8
+; VI-NEXT: s_lshr_b64 s[6:7], s[34:35], 24
+; VI-NEXT: s_lshr_b64 s[8:9], s[74:75], 24
+; VI-NEXT: s_lshr_b64 s[10:11], s[72:73], 24
+; VI-NEXT: s_lshr_b64 s[12:13], s[62:63], 24
+; VI-NEXT: s_lshr_b64 s[14:15], s[60:61], 24
+; VI-NEXT: s_lshr_b64 s[40:41], s[58:59], 24
+; VI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24
+; VI-NEXT: s_lshr_b64 s[44:45], s[46:47], 24
+; VI-NEXT: s_lshr_b32 s76, s35, 8
+; VI-NEXT: s_lshr_b32 s77, s34, 8
+; VI-NEXT: s_lshr_b32 s75, s75, 8
+; VI-NEXT: s_lshr_b32 s74, s74, 8
+; VI-NEXT: s_lshr_b32 s73, s73, 8
+; VI-NEXT: s_lshr_b32 s72, s72, 8
+; VI-NEXT: s_lshr_b32 s63, s63, 8
+; VI-NEXT: s_lshr_b32 s79, s62, 8
+; VI-NEXT: s_lshr_b32 s89, s61, 8
+; VI-NEXT: s_lshr_b32 s30, s60, 8
+; VI-NEXT: s_lshr_b32 s34, s59, 8
+; VI-NEXT: s_lshr_b32 s36, s58, 8
+; VI-NEXT: s_lshr_b32 s38, s57, 8
+; VI-NEXT: s_lshr_b32 s48, s56, 8
+; VI-NEXT: s_lshr_b32 s51, s47, 8
+; VI-NEXT: s_lshr_b32 s53, s46, 8
+; VI-NEXT: s_lshr_b32 s56, s54, 24
+; VI-NEXT: s_lshr_b32 s57, s54, 16
+; VI-NEXT: s_lshr_b32 s58, s66, 16
+; VI-NEXT: s_lshr_b32 s59, s90, 24
+; VI-NEXT: s_lshr_b32 s60, s90, 16
+; VI-NEXT: s_lshr_b32 s61, s49, 16
+; VI-NEXT: s_lshr_b32 s62, s78, 24
+; VI-NEXT: s_lshr_b32 s78, s78, 16
+; VI-NEXT: s_lshr_b32 s88, s88, 16
+; VI-NEXT: s_lshr_b32 s90, s91, 24
+; VI-NEXT: s_lshr_b32 s91, s91, 16
+; VI-NEXT: s_lshr_b32 s31, s31, 16
+; VI-NEXT: s_lshr_b32 s35, s37, 24
+; VI-NEXT: s_lshr_b32 s37, s37, 16
+; VI-NEXT: s_lshr_b32 s39, s39, 16
+; VI-NEXT: s_lshr_b32 s49, s50, 24
+; VI-NEXT: s_lshr_b32 s50, s50, 16
+; VI-NEXT: s_lshr_b32 s52, s52, 16
+; VI-NEXT: s_lshr_b32 s54, s55, 24
+; VI-NEXT: s_lshr_b32 s55, s55, 16
+; VI-NEXT: s_lshr_b32 s64, s64, 16
+; VI-NEXT: s_lshr_b32 s66, s65, 24
+; VI-NEXT: s_lshr_b32 s65, s65, 16
+; VI-NEXT: s_lshr_b32 s67, s67, 16
+; VI-NEXT: .LBB109_3: ; %end
+; VI-NEXT: s_and_b32 s7, s16, 0xff
+; VI-NEXT: s_lshl_b32 s9, s53, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s67, 0xff
+; VI-NEXT: s_lshl_b32 s11, s44, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_and_b32 s7, s17, 0xff
+; VI-NEXT: s_lshl_b32 s9, s51, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s65, 0xff
+; VI-NEXT: s_lshl_b32 s11, s66, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s18, 0xff
+; VI-NEXT: s_lshl_b32 s9, s48, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s64, 0xff
+; VI-NEXT: s_lshl_b32 s11, s42, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s19, 0xff
+; VI-NEXT: s_lshl_b32 s9, s38, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s55, 0xff
+; VI-NEXT: s_lshl_b32 s11, s54, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s20, 0xff
+; VI-NEXT: s_lshl_b32 s9, s36, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s52, 0xff
+; VI-NEXT: s_lshl_b32 s11, s40, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s21, 0xff
+; VI-NEXT: s_lshl_b32 s9, s34, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s50, 0xff
+; VI-NEXT: s_lshl_b32 s11, s49, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s22, 0xff
+; VI-NEXT: s_lshl_b32 s9, s30, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s39, 0xff
+; VI-NEXT: s_lshl_b32 s11, s14, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s23, 0xff
+; VI-NEXT: s_lshl_b32 s9, s89, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s37, 0xff
+; VI-NEXT: s_lshl_b32 s11, s35, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s24, 0xff
+; VI-NEXT: s_lshl_b32 s9, s79, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s31, 0xff
+; VI-NEXT: s_lshl_b32 s11, s12, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s25, 0xff
+; VI-NEXT: s_lshl_b32 s9, s63, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s91, 0xff
+; VI-NEXT: s_lshl_b32 s11, s90, 8
+; VI-NEXT: s_or_b32 s9, s9, s11
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s26, 0xff
+; VI-NEXT: s_lshl_b32 s9, s72, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s88, 0xff
+; VI-NEXT: s_lshl_b32 s10, s10, 8
+; VI-NEXT: s_or_b32 s9, s9, s10
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s27, 0xff
+; VI-NEXT: s_lshl_b32 s9, s73, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s78, 0xff
+; VI-NEXT: s_lshl_b32 s10, s62, 8
+; VI-NEXT: s_or_b32 s9, s9, s10
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s9, s9, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s28, 0xff
+; VI-NEXT: s_lshl_b32 s9, s74, 8
+; VI-NEXT: s_or_b32 s7, s7, s9
+; VI-NEXT: s_and_b32 s9, s61, 0xff
+; VI-NEXT: s_lshl_b32 s8, s8, 8
+; VI-NEXT: s_or_b32 s8, s9, s8
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s7, s29, 0xff
+; VI-NEXT: s_lshl_b32 s8, s75, 8
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: s_and_b32 s8, s60, 0xff
+; VI-NEXT: s_lshl_b32 s9, s59, 8
+; VI-NEXT: s_or_b32 s8, s8, s9
+; VI-NEXT: s_and_b32 s7, s7, 0xffff
+; VI-NEXT: s_lshl_b32 s8, s8, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0
+; VI-NEXT: s_or_b32 s7, s7, s8
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: s_and_b32 s4, s4, 0xff
+; VI-NEXT: s_lshl_b32 s7, s77, 8
+; VI-NEXT: s_or_b32 s4, s4, s7
+; VI-NEXT: s_and_b32 s7, s58, 0xff
+; VI-NEXT: s_lshl_b32 s6, s6, 8
+; VI-NEXT: s_or_b32 s6, s7, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_and_b32 s4, s5, 0xff
+; VI-NEXT: s_lshl_b32 s5, s76, 8
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: s_and_b32 s5, s57, 0xff
+; VI-NEXT: s_lshl_b32 s6, s56, 8
+; VI-NEXT: s_or_b32 s5, s5, s6
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_readlane_b32 s67, v4, 19
+; VI-NEXT: v_readlane_b32 s66, v4, 18
+; VI-NEXT: v_readlane_b32 s65, v4, 17
+; VI-NEXT: v_readlane_b32 s64, v4, 16
+; VI-NEXT: v_readlane_b32 s55, v4, 15
+; VI-NEXT: v_readlane_b32 s54, v4, 14
+; VI-NEXT: v_readlane_b32 s53, v4, 13
+; VI-NEXT: v_readlane_b32 s52, v4, 12
+; VI-NEXT: v_readlane_b32 s51, v4, 11
+; VI-NEXT: v_readlane_b32 s50, v4, 10
+; VI-NEXT: v_readlane_b32 s49, v4, 9
+; VI-NEXT: v_readlane_b32 s48, v4, 8
+; VI-NEXT: v_readlane_b32 s39, v4, 7
+; VI-NEXT: v_readlane_b32 s38, v4, 6
+; VI-NEXT: v_readlane_b32 s37, v4, 5
+; VI-NEXT: v_readlane_b32 s36, v4, 4
+; VI-NEXT: v_readlane_b32 s35, v4, 3
+; VI-NEXT: v_readlane_b32 s34, v4, 2
+; VI-NEXT: v_readlane_b32 s31, v4, 1
+; VI-NEXT: v_readlane_b32 s30, v4, 0
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB109_4:
+; VI-NEXT: ; implicit-def: $sgpr53
; VI-NEXT: ; implicit-def: $sgpr67
+; VI-NEXT: ; implicit-def: $sgpr44
+; VI-NEXT: ; implicit-def: $sgpr51
; VI-NEXT: ; implicit-def: $sgpr65
-; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: ; implicit-def: $sgpr66
+; VI-NEXT: ; implicit-def: $sgpr48
; VI-NEXT: ; implicit-def: $sgpr64
+; VI-NEXT: ; implicit-def: $sgpr42
+; VI-NEXT: ; implicit-def: $sgpr38
; VI-NEXT: ; implicit-def: $sgpr55
; VI-NEXT: ; implicit-def: $sgpr54
+; VI-NEXT: ; implicit-def: $sgpr36
; VI-NEXT: ; implicit-def: $sgpr52
-; VI-NEXT: ; implicit-def: $sgpr8
-; VI-NEXT: ; implicit-def: $sgpr53
-; VI-NEXT: ; implicit-def: $sgpr51
+; VI-NEXT: ; implicit-def: $sgpr40
+; VI-NEXT: ; implicit-def: $sgpr34
; VI-NEXT: ; implicit-def: $sgpr50
; VI-NEXT: ; implicit-def: $sgpr49
+; VI-NEXT: ; implicit-def: $sgpr30
; VI-NEXT: ; implicit-def: $sgpr39
-; VI-NEXT: ; implicit-def: $sgpr10
-; VI-NEXT: ; implicit-def: $sgpr48
-; VI-NEXT: ; implicit-def: $sgpr38
+; VI-NEXT: ; implicit-def: $sgpr14
+; VI-NEXT: ; implicit-def: $sgpr89
; VI-NEXT: ; implicit-def: $sgpr37
-; VI-NEXT: ; implicit-def: $sgpr36
-; VI-NEXT: ; implicit-def: $sgpr34
-; VI-NEXT: ; implicit-def: $sgpr12
; VI-NEXT: ; implicit-def: $sgpr35
+; VI-NEXT: ; implicit-def: $sgpr79
; VI-NEXT: ; implicit-def: $sgpr31
-; VI-NEXT: ; implicit-def: $sgpr30
+; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: ; implicit-def: $sgpr91
-; VI-NEXT: ; implicit-def: $sgpr89
-; VI-NEXT: ; implicit-def: $sgpr14
; VI-NEXT: ; implicit-def: $sgpr90
+; VI-NEXT: ; implicit-def: $sgpr72
; VI-NEXT: ; implicit-def: $sgpr88
-; VI-NEXT: ; implicit-def: $sgpr79
-; VI-NEXT: ; implicit-def: $sgpr78
-; VI-NEXT: ; implicit-def: $sgpr76
-; VI-NEXT: ; implicit-def: $sgpr40
-; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr75
-; VI-NEXT: ; implicit-def: $sgpr74
+; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr73
-; VI-NEXT: ; implicit-def: $sgpr63
-; VI-NEXT: ; implicit-def: $sgpr42
-; VI-NEXT: ; implicit-def: $sgpr72
+; VI-NEXT: ; implicit-def: $sgpr78
; VI-NEXT: ; implicit-def: $sgpr62
+; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: ; implicit-def: $sgpr61
+; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: ; implicit-def: $sgpr60
-; VI-NEXT: ; implicit-def: $sgpr58
-; VI-NEXT: ; implicit-def: $sgpr44
; VI-NEXT: ; implicit-def: $sgpr59
+; VI-NEXT: ; implicit-def: $sgpr77
+; VI-NEXT: ; implicit-def: $sgpr58
+; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: ; implicit-def: $sgpr57
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: s_branch .LBB109_2
-; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v19, s44
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s42
-; VI-NEXT: v_mov_b32_e32 v1, s16
-; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_mov_b32_e32 v3, s18
-; VI-NEXT: v_mov_b32_e32 v4, s19
-; VI-NEXT: v_mov_b32_e32 v5, s20
-; VI-NEXT: v_mov_b32_e32 v6, s21
-; VI-NEXT: v_mov_b32_e32 v7, s22
-; VI-NEXT: v_mov_b32_e32 v8, s23
-; VI-NEXT: v_mov_b32_e32 v9, s24
-; VI-NEXT: v_mov_b32_e32 v10, s25
-; VI-NEXT: v_mov_b32_e32 v11, s26
-; VI-NEXT: v_mov_b32_e32 v12, s27
-; VI-NEXT: v_mov_b32_e32 v13, s28
-; VI-NEXT: v_mov_b32_e32 v14, s29
-; VI-NEXT: v_mov_b32_e32 v15, s4
-; VI-NEXT: v_mov_b32_e32 v16, s5
-; VI-NEXT: v_mov_b32_e32 v18, s67
-; VI-NEXT: v_mov_b32_e32 v62, s65
-; VI-NEXT: v_mov_b32_e32 v17, s66
-; VI-NEXT: v_mov_b32_e32 v60, s64
-; VI-NEXT: v_mov_b32_e32 v61, s55
-; VI-NEXT: v_mov_b32_e32 v58, s54
-; VI-NEXT: v_mov_b32_e32 v59, s52
-; VI-NEXT: v_mov_b32_e32 v57, s53
-; VI-NEXT: v_mov_b32_e32 v47, s51
-; VI-NEXT: v_mov_b32_e32 v56, s50
-; VI-NEXT: v_mov_b32_e32 v46, s49
-; VI-NEXT: v_mov_b32_e32 v45, s39
-; VI-NEXT: v_mov_b32_e32 v44, s48
-; VI-NEXT: v_mov_b32_e32 v42, s38
-; VI-NEXT: v_mov_b32_e32 v43, s37
-; VI-NEXT: v_mov_b32_e32 v41, s36
-; VI-NEXT: v_mov_b32_e32 v40, s34
-; VI-NEXT: v_mov_b32_e32 v55, s35
-; VI-NEXT: v_mov_b32_e32 v53, s31
-; VI-NEXT: v_mov_b32_e32 v54, s30
-; VI-NEXT: v_mov_b32_e32 v52, s91
-; VI-NEXT: v_mov_b32_e32 v51, s89
-; VI-NEXT: v_mov_b32_e32 v50, s90
-; VI-NEXT: v_mov_b32_e32 v48, s88
-; VI-NEXT: v_mov_b32_e32 v49, s79
-; VI-NEXT: v_mov_b32_e32 v39, s78
-; VI-NEXT: v_mov_b32_e32 v38, s76
-; VI-NEXT: v_mov_b32_e32 v37, s77
-; VI-NEXT: v_mov_b32_e32 v35, s75
-; VI-NEXT: v_mov_b32_e32 v36, s74
-; VI-NEXT: v_mov_b32_e32 v34, s73
-; VI-NEXT: v_mov_b32_e32 v33, s63
-; VI-NEXT: v_mov_b32_e32 v32, s72
-; VI-NEXT: v_mov_b32_e32 v30, s62
-; VI-NEXT: v_mov_b32_e32 v31, s61
-; VI-NEXT: v_mov_b32_e32 v29, s60
-; VI-NEXT: v_mov_b32_e32 v28, s58
-; VI-NEXT: v_mov_b32_e32 v27, s59
-; VI-NEXT: v_mov_b32_e32 v25, s57
-; VI-NEXT: v_mov_b32_e32 v26, s56
-; VI-NEXT: v_mov_b32_e32 v21, s12
-; VI-NEXT: v_mov_b32_e32 v22, s10
-; VI-NEXT: v_mov_b32_e32 v23, s8
-; VI-NEXT: v_mov_b32_e32 v24, s6
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v19, s40
-; VI-NEXT: v_mov_b32_e32 v20, s14
-; VI-NEXT: .LBB109_5: ; %end
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24
-; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_readlane_b32 s67, v63, 19
-; VI-NEXT: v_readlane_b32 s66, v63, 18
-; VI-NEXT: v_readlane_b32 s65, v63, 17
-; VI-NEXT: v_readlane_b32 s64, v63, 16
-; VI-NEXT: v_readlane_b32 s55, v63, 15
-; VI-NEXT: v_readlane_b32 s54, v63, 14
-; VI-NEXT: v_readlane_b32 s53, v63, 13
-; VI-NEXT: v_readlane_b32 s52, v63, 12
-; VI-NEXT: v_readlane_b32 s51, v63, 11
-; VI-NEXT: v_readlane_b32 s50, v63, 10
-; VI-NEXT: v_readlane_b32 s49, v63, 9
-; VI-NEXT: v_readlane_b32 s48, v63, 8
-; VI-NEXT: v_readlane_b32 s39, v63, 7
-; VI-NEXT: v_readlane_b32 s38, v63, 6
-; VI-NEXT: v_readlane_b32 s37, v63, 5
-; VI-NEXT: v_readlane_b32 s36, v63, 4
-; VI-NEXT: v_readlane_b32 s35, v63, 3
-; VI-NEXT: v_readlane_b32 s34, v63, 2
-; VI-NEXT: v_readlane_b32 s31, v63, 1
-; VI-NEXT: v_readlane_b32 s30, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 6fe66655de3d6..7e3fc4805ef48 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2014,10 +2014,10 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2029,29 +2029,33 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB22_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB22_2
; SI-NEXT: .LBB22_4: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2064,12 +2068,12 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB22_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -2080,15 +2084,15 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -2098,9 +2102,9 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB22_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2273,30 +2277,34 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: s_cbranch_scc0 .LBB23_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v6
; SI-NEXT: s_cbranch_execnz .LBB23_3
; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: .LBB23_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB23_4:
@@ -2311,7 +2319,7 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB23_4
; VI-NEXT: .LBB23_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -2319,7 +2327,7 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -2327,15 +2335,15 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -2345,9 +2353,9 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB23_3:
; VI-NEXT: s_branch .LBB23_2
@@ -5158,10 +5166,10 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -5173,29 +5181,33 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB46_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB46_2
; SI-NEXT: .LBB46_4: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5208,12 +5220,12 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB46_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -5224,15 +5236,15 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -5242,9 +5254,9 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB46_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5417,30 +5429,34 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: s_cbranch_scc0 .LBB47_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v6
; SI-NEXT: s_cbranch_execnz .LBB47_3
; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: .LBB47_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB47_4:
@@ -5455,7 +5471,7 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB47_4
; VI-NEXT: .LBB47_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -5463,7 +5479,7 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -5471,15 +5487,15 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -5489,9 +5505,9 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_3:
; VI-NEXT: s_branch .LBB47_2
@@ -8014,10 +8030,10 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -8029,29 +8045,33 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB66_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB66_2
; SI-NEXT: .LBB66_4: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8064,12 +8084,12 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB66_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -8080,15 +8100,15 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -8098,9 +8118,9 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB66_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -8273,30 +8293,34 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: s_cbranch_scc0 .LBB67_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v6
; SI-NEXT: s_cbranch_execnz .LBB67_3
; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: .LBB67_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB67_4:
@@ -8311,7 +8335,7 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB67_4
; VI-NEXT: .LBB67_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -8319,7 +8343,7 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -8327,15 +8351,15 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -8345,9 +8369,9 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB67_3:
; VI-NEXT: s_branch .LBB67_2
@@ -10543,10 +10567,10 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -10558,29 +10582,33 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB82_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB82_2
; SI-NEXT: .LBB82_4: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -10593,12 +10621,12 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB82_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -10609,15 +10637,15 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -10627,9 +10655,9 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB82_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -10802,30 +10830,34 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
; SI-NEXT: s_cbranch_scc0 .LBB83_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v6
; SI-NEXT: s_cbranch_execnz .LBB83_3
; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: .LBB83_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB83_4:
@@ -10840,7 +10872,7 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB83_4
; VI-NEXT: .LBB83_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -10848,7 +10880,7 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -10856,15 +10888,15 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -10874,9 +10906,9 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB83_3:
; VI-NEXT: s_branch .LBB83_2
@@ -12733,19 +12765,21 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB94_2
; SI-NEXT: .LBB94_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -12792,10 +12826,10 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB94_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -12983,19 +13017,21 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; SI-NEXT: s_cbranch_execnz .LBB95_3
; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: .LBB95_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -13028,12 +13064,11 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
@@ -13048,9 +13083,10 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB95_3:
; VI-NEXT: s_branch .LBB95_2
@@ -14617,10 +14653,10 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB102_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -14856,12 +14892,11 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
@@ -14876,9 +14911,10 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB103_3:
; VI-NEXT: s_branch .LBB103_2
@@ -16095,10 +16131,10 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -16117,15 +16153,18 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB108_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: ; implicit-def: $vgpr9
@@ -16133,18 +16172,21 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: .LBB108_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -16178,50 +16220,51 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB108_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
-; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; VI-NEXT: v_or_b32_e32 v1, v9, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
-; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v1, v6, v2, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_e32 v0, v8, v0
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-NEXT: .LBB108_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, v8
@@ -16471,35 +16514,41 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
; SI-NEXT: s_cbranch_scc0 .LBB109_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
-; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; SI-NEXT: s_cbranch_execnz .LBB109_3
; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -16526,74 +16575,75 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
; VI-NEXT: s_lshr_b32 s8, s17, 24
-; VI-NEXT: s_lshr_b32 s5, s17, 16
-; VI-NEXT: s_lshr_b32 s9, s17, 8
-; VI-NEXT: s_lshr_b32 s10, s16, 16
-; VI-NEXT: s_lshr_b32 s11, s16, 8
+; VI-NEXT: s_lshr_b32 s10, s17, 16
+; VI-NEXT: s_lshr_b32 s5, s17, 8
+; VI-NEXT: s_lshr_b32 s11, s16, 16
+; VI-NEXT: s_lshr_b32 s9, s16, 8
; VI-NEXT: s_cbranch_execnz .LBB109_4
; VI-NEXT: .LBB109_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v7, v2, v3, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_e32 v2, v8, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16
-; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
; VI-NEXT: v_mov_b32_e32 v4, v8
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB109_3:
+; VI-NEXT: ; implicit-def: $sgpr9
; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr9
; VI-NEXT: ; implicit-def: $sgpr5
+; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: s_branch .LBB109_2
; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v1, s11
-; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s11
+; VI-NEXT: v_mov_b32_e32 v6, s10
; VI-NEXT: v_mov_b32_e32 v7, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v6, s5
; VI-NEXT: v_mov_b32_e32 v4, s17
; VI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index e5245f7bd71d3..18c503cc7a6ed 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -1896,12 +1896,12 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -1913,12 +1913,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB10_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr7
@@ -1928,24 +1931,27 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB10_2
; SI-NEXT: .LBB10_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1958,12 +1964,12 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB10_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -1974,15 +1980,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -1992,15 +1998,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -2010,9 +2016,9 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB10_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2228,40 +2234,46 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
; SI-NEXT: s_cbranch_scc0 .LBB11_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v9
; SI-NEXT: s_cbranch_execnz .LBB11_3
; SI-NEXT: .LBB11_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: .LBB11_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB11_4:
@@ -2276,7 +2288,7 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB11_4
; VI-NEXT: .LBB11_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -2284,7 +2296,7 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -2292,17 +2304,17 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -2310,15 +2322,15 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -2328,9 +2340,9 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB11_3:
; VI-NEXT: s_branch .LBB11_2
@@ -5039,12 +5051,12 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3
; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -5056,12 +5068,15 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB26_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: ; implicit-def: $vgpr7
@@ -5071,24 +5086,27 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB26_2
; SI-NEXT: .LBB26_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5101,12 +5119,12 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB26_2
; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
@@ -5117,15 +5135,15 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
@@ -5135,15 +5153,15 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
-; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
@@ -5153,9 +5171,9 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB26_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5371,40 +5389,46 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
; SI-NEXT: s_cbranch_scc0 .LBB27_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v9
; SI-NEXT: s_cbranch_execnz .LBB27_3
; SI-NEXT: .LBB27_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: .LBB27_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB27_4:
@@ -5419,7 +5443,7 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: s_cbranch_execnz .LBB27_4
; VI-NEXT: .LBB27_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
@@ -5427,7 +5451,7 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; VI-NEXT: v_add_f32_e32 v2, s4, v0
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
@@ -5435,17 +5459,17 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v1, s4, v0
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
@@ -5453,15 +5477,15 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f32_e32 v3, s4, v0
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: s_lshl_b32 s4, s16, 16
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_add_f32_e32 v0, s4, v0
@@ -5471,9 +5495,9 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB27_3:
; VI-NEXT: s_branch .LBB27_2
@@ -7465,12 +7489,12 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1
; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v4
; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $vgpr2
@@ -7493,46 +7517,56 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB38_3: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12
-; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16
-; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v8, v6, v7
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v13
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12
; SI-NEXT: ; implicit-def: $vgpr17
; SI-NEXT: ; implicit-def: $vgpr16
; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr12
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB38_2
; SI-NEXT: .LBB38_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v8, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -7546,9 +7580,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; VI-LABEL: bitcast_v6bf16_to_v12i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v13, v2
-; VI-NEXT: v_mov_b32_e32 v16, v1
-; VI-NEXT: v_mov_b32_e32 v15, v0
+; VI-NEXT: v_mov_b32_e32 v8, v2
+; VI-NEXT: v_mov_b32_e32 v14, v1
+; VI-NEXT: v_mov_b32_e32 v13, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: ; implicit-def: $vgpr2
@@ -7563,89 +7597,91 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB38_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13
; VI-NEXT: .LBB38_2: ; %Flow
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB38_4
; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v14
; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
; VI-NEXT: s_movk_i32 s6, 0x7fff
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15
-; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
-; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v15, v1, v0, 16
-; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; VI-NEXT: v_or_b32_e32 v1, v14, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
-; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
-; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v13, v1, v0, 16
-; VI-NEXT: v_mov_b32_e32 v14, 0x7fc07fc0
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14]
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16
+; VI-NEXT: v_cndmask_b32_e32 v15, v3, v4, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_or_b32_e32 v0, v13, v0
+; VI-NEXT: v_or_b32_e32 v6, v8, v3
+; VI-NEXT: v_mov_b32_e32 v7, 0x7fc07fc0
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1]
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15
; VI-NEXT: .LBB38_4: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, v15
-; VI-NEXT: v_mov_b32_e32 v4, v16
-; VI-NEXT: v_mov_b32_e32 v8, v13
+; VI-NEXT: v_mov_b32_e32 v0, v13
+; VI-NEXT: v_mov_b32_e32 v4, v14
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v6bf16_to_v12i8:
@@ -7981,47 +8017,57 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21
; SI-NEXT: s_cbranch_scc0 .LBB39_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12
-; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16
-; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
+; SI-NEXT: v_or_b32_e32 v8, v6, v7
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12
; SI-NEXT: s_cbranch_execnz .LBB39_3
; SI-NEXT: .LBB39_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
-; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16
+; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT: v_or_b32_e32 v4, v1, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
+; SI-NEXT: v_or_b32_e32 v8, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16
; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24
; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16
; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8
@@ -8052,111 +8098,111 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: s_cmp_lg_u32 s19, 0
; VI-NEXT: s_cbranch_scc0 .LBB39_3
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s19, s16, 8
-; VI-NEXT: s_lshr_b32 s10, s18, 16
+; VI-NEXT: s_lshr_b32 s19, s18, 16
; VI-NEXT: s_lshr_b32 s11, s18, 8
-; VI-NEXT: s_lshr_b32 s12, s17, 24
+; VI-NEXT: s_lshr_b32 s14, s17, 24
; VI-NEXT: s_lshr_b32 s13, s17, 16
-; VI-NEXT: s_lshr_b32 s15, s17, 8
-; VI-NEXT: s_lshr_b32 s14, s16, 16
+; VI-NEXT: s_lshr_b32 s10, s17, 8
+; VI-NEXT: s_lshr_b32 s15, s16, 16
+; VI-NEXT: s_lshr_b32 s12, s16, 8
; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
; VI-NEXT: s_cbranch_execnz .LBB39_4
; VI-NEXT: .LBB39_2: ; %cmp.true
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_lshl_b32 s4, s17, 16
+; VI-NEXT: v_cndmask_b32_e32 v14, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_or_b32_e32 v2, v13, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v15, v1, v4, vcc
+; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; VI-NEXT: v_add_f32_e32 v4, s4, v3
+; VI-NEXT: v_bfe_u32 v5, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_lshl_b32 s4, s18, 16
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_add_f32_e32 v3, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v10, v5, v6, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16
-; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15]
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
-; VI-NEXT: s_branch .LBB39_5
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_or_b32_e32 v1, v0, v1
+; VI-NEXT: v_or_b32_e32 v6, v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, 0x7fc07fc0
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
+; VI-NEXT: v_mov_b32_e32 v4, v13
+; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB39_3:
-; VI-NEXT: ; implicit-def: $sgpr19
-; VI-NEXT: ; implicit-def: $sgpr14
-; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr12
; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr4
+; VI-NEXT: ; implicit-def: $sgpr10
; VI-NEXT: ; implicit-def: $sgpr13
-; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr14
; VI-NEXT: ; implicit-def: $sgpr11
-; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr19
; VI-NEXT: ; implicit-def: $sgpr6
; VI-NEXT: s_branch .LBB39_2
; VI-NEXT: .LBB39_4:
-; VI-NEXT: v_mov_b32_e32 v14, s16
-; VI-NEXT: v_mov_b32_e32 v15, s17
; VI-NEXT: v_mov_b32_e32 v8, s18
-; VI-NEXT: v_mov_b32_e32 v1, s19
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v5, s15
+; VI-NEXT: v_mov_b32_e32 v10, s19
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s15
+; VI-NEXT: v_mov_b32_e32 v7, s14
; VI-NEXT: v_mov_b32_e32 v6, s13
-; VI-NEXT: v_mov_b32_e32 v7, s12
-; VI-NEXT: v_mov_b32_e32 v13, s11
-; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: v_mov_b32_e32 v9, s11
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v5, s10
; VI-NEXT: v_mov_b32_e32 v11, s6
; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: .LBB39_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v14
-; VI-NEXT: v_mov_b32_e32 v4, v15
-; VI-NEXT: v_mov_b32_e32 v9, v13
+; VI-NEXT: v_mov_b32_e32 v4, s17
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bitcast_v6bf16_to_v12i8_scalar:
@@ -11368,12 +11414,12 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB48_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -11675,12 +11721,11 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
@@ -11692,12 +11737,11 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
@@ -11712,10 +11756,12 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB49_3:
; VI-NEXT: s_branch .LBB49_2
@@ -12234,25 +12280,29 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; SI-NEXT: s_cbranch_execz .LBB52_2
; SI-NEXT: .LBB52_4: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -12315,12 +12365,12 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16
-; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: .LBB52_2: ; %end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -12563,25 +12613,29 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8
; SI-NEXT: s_cbranch_execnz .LBB53_3
; SI-NEXT: .LBB53_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; SI-NEXT: v_or_b32_e32 v4, v2, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v6
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: .LBB53_3: ; %end
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -12616,12 +12670,11 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: s_lshl_b32 s4, s17, 16
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
@@ -12633,12 +12686,11 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: s_lshl_b32 s4, s18, 16
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
@@ -12653,10 +12705,12 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16
-; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB53_3:
; VI-NEXT: s_branch .LBB53_2
diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
index cc9f595f9d0b6..4c2fd3323220a 100644
--- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -6,46 +6,46 @@
define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) %arg1) local_unnamed_addr #0 {
; GFX6-LABEL: any_extend_vector_inreg_v16i8_to_v4i32:
; GFX6: ; %bb.0: ; %bb
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s15, 0xf000
+; GFX6-NEXT: s_mov_b32 s14, -1
; GFX6-NEXT: v_mov_b32_e32 v0, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s0, s14
-; GFX6-NEXT: s_mov_b32 s1, s15
-; GFX6-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
+; GFX6-NEXT: s_mov_b32 s12, s10
+; GFX6-NEXT: s_mov_b32 s13, s11
+; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s4, s[12:13], 0x8
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:13
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:15
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:14
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:11
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:10
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:4
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:6
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3
-; GFX6-NEXT: s_lshr_b32 s8, s9, 16
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NEXT: s_load_dword s0, s[8:9], 0x8
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:13
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:15
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:11
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 8
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:9
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2
+; GFX6-NEXT: s_lshr_b32 s0, s2, 24
+; GFX6-NEXT: s_lshr_b32 s1, s5, 24
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
-; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:7
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5
-; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7
+; GFX6-NEXT: v_mov_b32_e32 v0, s9
+; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: any_extend_vector_inreg_v16i8_to_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3ca7db155b385..7896bdfbaeba8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -12144,13 +12144,14 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7LESS-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7LESS-NEXT: v_add_f32_e32 v4, v4, v0
-; GFX7LESS-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7LESS-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GFX7LESS-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7LESS-NEXT: v_add_f32_e32 v5, v2, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7LESS-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v5, v3
; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2
; GFX7LESS-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc
@@ -12165,10 +12166,11 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7LESS-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX7LESS-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -12181,32 +12183,32 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s1, s[10:11], 0x0
-; GFX8-NEXT: s_lshl_b32 s12, s0, 16
-; GFX8-NEXT: s_and_b32 s13, s0, 0xffff0000
+; GFX8-NEXT: s_and_b32 s12, s0, 0xffff0000
+; GFX8-NEXT: s_lshl_b32 s13, s0, 16
; GFX8-NEXT: s_mov_b32 s4, s10
; GFX8-NEXT: s_mov_b32 s5, s11
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, s12, v0
; GFX8-NEXT: v_add_f32_e32 v2, s13, v2
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index cd6d741beeab3..f29077191f74d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -936,8 +936,9 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
@@ -949,10 +950,11 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
@@ -998,11 +1000,12 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -1012,9 +1015,10 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: s_mov_b32 s7, 0xf000
@@ -1071,10 +1075,12 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v6
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
@@ -1086,14 +1092,16 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
@@ -1134,26 +1142,30 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v6, v7
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
+; GCN-NEXT: v_or_b32_e32 v1, v11, v10
+; GCN-NEXT: v_or_b32_e32 v0, v0, v12
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
-; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1162,25 +1174,29 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -1223,12 +1239,10 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
@@ -1236,24 +1250,34 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
-; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
-; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
-; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
-; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
-; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
+; GCN-NEXT: v_or_b32_e32 v3, v6, v7
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
+; GCN-NEXT: v_or_b32_e32 v1, v19, v18
+; GCN-NEXT: v_or_b32_e32 v0, v0, v20
+; GCN-NEXT: v_or_b32_e32 v7, v14, v15
+; GCN-NEXT: v_or_b32_e32 v6, v12, v13
+; GCN-NEXT: v_or_b32_e32 v5, v10, v11
+; GCN-NEXT: v_or_b32_e32 v4, v8, v9
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -1263,41 +1287,49 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v14, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v13, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v12, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v11, v1, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
-; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1348,26 +1380,32 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
-; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v21, v22, v23
+; GCN-NEXT: v_or_b32_e32 v20, v20, v31
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v19, v18, v19
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v18, v16, v17
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v5, v6, v7
+; GCN-NEXT: v_or_b32_e32 v4, v4, v16
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -1381,45 +1419,55 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_or_b32_e32 v3, v1, v0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v30
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v9
; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
-; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
-; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
-; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
-; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v7, v6
+; GCN-NEXT: v_or_b32_e32 v9, v14, v15
+; GCN-NEXT: v_or_b32_e32 v8, v12, v13
+; GCN-NEXT: v_or_b32_e32 v7, v10, v11
+; GCN-NEXT: v_or_b32_e32 v6, v28, v27
+; GCN-NEXT: v_or_b32_e32 v12, v17, v16
+; GCN-NEXT: v_or_b32_e32 v11, v23, v22
+; GCN-NEXT: v_or_b32_e32 v10, v24, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v13
+; GCN-NEXT: v_or_b32_e32 v13, v29, v6
; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -1428,80 +1476,96 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-LABEL: v_store_global_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13
-; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GFX7-NEXT: v_or_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
+; GFX7-NEXT: v_or_b32_e32 v13, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v11, v1, v0
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v7, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v10, v7, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
-; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v9, v7, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_or_b32_e32 v16, v18, v16
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v31
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_or_b32_e32 v17, v15, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v15, v15, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_or_b32_e32 v14, v18, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
@@ -1565,26 +1629,32 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
-; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v21, v22, v23
+; GCN-NEXT: v_or_b32_e32 v20, v20, v31
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v19, v18, v19
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v18, v16, v17
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v13, v14, v15
+; GCN-NEXT: v_or_b32_e32 v12, v12, v16
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
@@ -1596,29 +1666,37 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
-; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
-; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
-; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
-; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_or_b32_e32 v11, v10, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v9
+; GCN-NEXT: v_or_b32_e32 v3, v6, v7
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
+; GCN-NEXT: v_or_b32_e32 v1, v23, v22
+; GCN-NEXT: v_or_b32_e32 v0, v0, v26
+; GCN-NEXT: v_or_b32_e32 v6, v15, v14
+; GCN-NEXT: v_or_b32_e32 v5, v17, v16
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
@@ -1643,9 +1721,9 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_or_b32_e32 v4, v23, v4
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
@@ -1660,41 +1738,48 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
-; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
-; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
-; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_or_b32_e32 v13, v14, v7
+; GCN-NEXT: v_or_b32_e32 v12, v16, v15
+; GCN-NEXT: v_or_b32_e32 v11, v22, v17
+; GCN-NEXT: v_or_b32_e32 v10, v23, v10
+; GCN-NEXT: v_or_b32_e32 v17, v25, v24
+; GCN-NEXT: v_or_b32_e32 v16, v19, v18
+; GCN-NEXT: v_or_b32_e32 v15, v21, v20
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(6)
@@ -1711,63 +1796,74 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
-; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
-; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
-; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_or_b32_e32 v14, v14, v7
+; GCN-NEXT: v_or_b32_e32 v21, v19, v18
+; GCN-NEXT: v_or_b32_e32 v20, v24, v20
+; GCN-NEXT: v_or_b32_e32 v19, v23, v22
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16
; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
-; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
+; GCN-NEXT: v_or_b32_e32 v18, v18, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_or_b32_e32 v7, v7, v22
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GCN-NEXT: v_or_b32_e32 v25, v23, v22
+; GCN-NEXT: v_or_b32_e32 v24, v26, v24
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:12
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v23, v22, v23
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v22, v26, v22
; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:64
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -1789,47 +1885,58 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GFX7-NEXT: v_or_b32_e32 v36, v32, v31
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_or_b32_e32 v35, v34, v33
+; GFX7-NEXT: v_or_b32_e32 v34, v32, v31
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_or_b32_e32 v33, v32, v31
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
@@ -1844,19 +1951,22 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v48
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX7-NEXT: v_or_b32_e32 v36, v34, v33
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: v_or_b32_e32 v35, v37, v35
+; GFX7-NEXT: v_or_b32_e32 v34, v34, v33
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
@@ -1867,26 +1977,32 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX7-NEXT: v_or_b32_e32 v33, v37, v33
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v49
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX7-NEXT: v_or_b32_e32 v36, v34, v33
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: v_or_b32_e32 v35, v37, v35
+; GFX7-NEXT: v_or_b32_e32 v34, v34, v33
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
@@ -1897,83 +2013,100 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX7-NEXT: v_or_b32_e32 v33, v37, v33
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v49
; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX7-NEXT: v_or_b32_e32 v36, v34, v33
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX7-NEXT: v_or_b32_e32 v35, v37, v35
+; GFX7-NEXT: v_or_b32_e32 v34, v34, v33
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX7-NEXT: v_or_b32_e32 v33, v37, v33
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: s_nop 0
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v6, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
-; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GFX7-NEXT: v_or_b32_e32 v13, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v11, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v10, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v12, v7, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v9, v1, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
-; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v8, v7, v6
+; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v6, v1, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v17, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
-; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v16, v15, v14
+; GFX7-NEXT: v_or_b32_e32 v15, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v14, v1, v0
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v33
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
@@ -2887,8 +3020,9 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
@@ -2900,10 +3034,11 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
@@ -2949,11 +3084,12 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -2963,9 +3099,10 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: s_mov_b32 s7, 0xf000
@@ -3022,10 +3159,12 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v6
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
@@ -3037,14 +3176,16 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
@@ -3085,26 +3226,30 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v6, v7
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
+; GCN-NEXT: v_or_b32_e32 v1, v11, v10
+; GCN-NEXT: v_or_b32_e32 v0, v0, v12
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16
-; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -3113,25 +3258,29 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -3174,12 +3323,10 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
@@ -3187,24 +3334,34 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16
-; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16
-; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16
-; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16
-; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16
-; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16
+; GCN-NEXT: v_or_b32_e32 v3, v6, v7
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
+; GCN-NEXT: v_or_b32_e32 v1, v19, v18
+; GCN-NEXT: v_or_b32_e32 v0, v0, v20
+; GCN-NEXT: v_or_b32_e32 v7, v14, v15
+; GCN-NEXT: v_or_b32_e32 v6, v12, v13
+; GCN-NEXT: v_or_b32_e32 v5, v10, v11
+; GCN-NEXT: v_or_b32_e32 v4, v8, v9
; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -3214,41 +3371,49 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v14, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v13, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v12, v1, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v11, v1, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
-; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4221,9 +4386,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
@@ -4256,9 +4422,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3
@@ -9156,13 +9323,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-LABEL: v_fadd_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -9173,9 +9340,9 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v2bf16:
@@ -9358,9 +9525,9 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v3bf16:
@@ -9598,10 +9765,10 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v4bf16:
@@ -9953,18 +10120,18 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v8bf16:
@@ -10638,22 +10805,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v16bf16:
@@ -11816,8 +11983,8 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_add_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_add_f32_e32 v33, v33, v34
@@ -12080,36 +12247,36 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fadd_v32bf16:
@@ -13593,13 +13760,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-LABEL: v_fsub_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -13610,9 +13777,9 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v2bf16:
@@ -13795,9 +13962,9 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v3bf16:
@@ -14035,10 +14202,10 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fsub_v4bf16:
@@ -14355,13 +14522,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-LABEL: v_fmul_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -14372,9 +14539,9 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v2bf16:
@@ -14557,9 +14724,9 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v3bf16:
@@ -14797,10 +14964,10 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v4bf16:
@@ -15152,18 +15319,18 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v8bf16:
@@ -15837,22 +16004,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v16bf16:
@@ -17015,8 +17182,8 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34
@@ -17279,36 +17446,36 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmul_v32bf16:
@@ -19102,13 +19269,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-LABEL: v_minnum_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_min_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -19119,9 +19286,9 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v2bf16:
@@ -19304,9 +19471,9 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v3bf16:
@@ -19544,10 +19711,10 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v4bf16:
@@ -19899,18 +20066,18 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v8bf16:
@@ -20584,22 +20751,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v16bf16:
@@ -21762,8 +21929,8 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_min_f32_e32 v33, v33, v34
@@ -22026,36 +22193,36 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minnum_v32bf16:
@@ -23356,13 +23523,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-LABEL: v_maxnum_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_max_f32_e32 v2, v3, v2
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
@@ -23373,9 +23540,9 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v2bf16:
@@ -23558,9 +23725,9 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v3bf16:
@@ -23798,10 +23965,10 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v4bf16:
@@ -24153,18 +24320,18 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v8bf16:
@@ -24838,22 +25005,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v16bf16:
@@ -26016,8 +26183,8 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30
; GFX8-NEXT: v_max_f32_e32 v33, v33, v34
@@ -26280,36 +26447,36 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
-; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
-; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
-; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
-; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
-; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
-; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
-; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
-; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
-; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16
-; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30
+; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v32bf16:
@@ -32680,22 +32847,22 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
@@ -32823,6 +32990,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -32831,7 +32999,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
@@ -32841,9 +33009,9 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -33009,7 +33177,6 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
@@ -33018,28 +33185,29 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
@@ -33297,22 +33465,22 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
@@ -33428,6 +33596,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
@@ -33436,7 +33605,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
@@ -33447,8 +33616,8 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -33613,7 +33782,7 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
@@ -33623,10 +33792,10 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
@@ -34003,36 +34172,37 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT: v_ffbh_i32_e32 v4, v1
+; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3
+; GFX8-NEXT: v_ffbh_i32_e32 v4, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
-; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
-; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0
-; GFX8-NEXT: v_ffbh_i32_e32 v0, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT: v_xor_b32_e32 v6, v0, v1
+; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT: v_ffbh_i32_e32 v5, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
@@ -34040,8 +34210,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
@@ -34382,8 +34551,8 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -34789,16 +34958,16 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1
-; GFX8-NEXT: v_ffbh_i32_e32 v8, v1
-; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX8-NEXT: v_ffbh_i32_e32 v7, v1
+; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
-; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8
+; GFX8-NEXT: v_min_u32_e32 v7, v7, v8
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
@@ -34807,10 +34976,10 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v7
; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
@@ -34836,10 +35005,10 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
@@ -35288,8 +35457,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
@@ -35301,9 +35470,9 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -35452,9 +35621,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -35623,7 +35792,6 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
@@ -35632,28 +35800,29 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -35916,22 +36085,22 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -36047,6 +36216,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
@@ -36055,7 +36225,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
@@ -36066,8 +36236,8 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -36232,7 +36402,7 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
@@ -36242,10 +36412,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -36574,27 +36744,28 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_ffbh_u32_e32 v4, v1
+; GFX8-NEXT: v_ffbh_u32_e32 v4, v3
; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4
-; GFX8-NEXT: v_ldexp_f32 v4, v0, v1
-; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0
-; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT: v_ffbh_u32_e32 v5, v1
+; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
@@ -36602,8 +36773,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
@@ -36870,8 +37040,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -37188,14 +37358,14 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX8-NEXT: v_ffbh_u32_e32 v7, v1
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
-; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
@@ -37205,10 +37375,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v7
; GFX8-NEXT: v_ldexp_f32 v6, v0, v1
; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
@@ -37230,10 +37400,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
@@ -38231,49 +38401,53 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
; GCN-LABEL: v_select_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_or_b32_e32 v2, v4, v5
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_v3bf16:
@@ -38328,14 +38502,18 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v6
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: v_or_b32_e32 v4, v7, v8
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -38349,22 +38527,26 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -38430,18 +38612,24 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_or_b32_e32 v2, v7, v8
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: v_or_b32_e32 v4, v9, v10
+; GCN-NEXT: v_or_b32_e32 v5, v5, v6
+; GCN-NEXT: v_or_b32_e32 v6, v11, v12
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
@@ -38458,30 +38646,36 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
@@ -38558,22 +38752,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_or_b32_e32 v2, v9, v10
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: v_or_b32_e32 v4, v11, v12
+; GCN-NEXT: v_or_b32_e32 v5, v5, v6
+; GCN-NEXT: v_or_b32_e32 v6, v13, v14
+; GCN-NEXT: v_or_b32_e32 v7, v7, v8
+; GCN-NEXT: v_or_b32_e32 v8, v15, v16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
@@ -38593,38 +38795,46 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v9, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v9, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13
-; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v9, v6
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GFX7-NEXT: v_or_b32_e32 v8, v9, v8
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
@@ -38692,16 +38902,19 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_or_b32_e32 v2, v17, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
@@ -38727,32 +38940,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_or_b32_e32 v4, v17, v4
+; GCN-NEXT: v_or_b32_e32 v5, v5, v6
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_or_b32_e32 v18, v19, v18
+; GCN-NEXT: v_or_b32_e32 v7, v7, v8
+; GCN-NEXT: v_or_b32_e32 v8, v21, v20
+; GCN-NEXT: v_or_b32_e32 v9, v9, v10
+; GCN-NEXT: v_or_b32_e32 v10, v23, v22
+; GCN-NEXT: v_or_b32_e32 v11, v11, v12
+; GCN-NEXT: v_or_b32_e32 v12, v25, v24
+; GCN-NEXT: v_or_b32_e32 v13, v13, v14
+; GCN-NEXT: v_or_b32_e32 v14, v27, v26
+; GCN-NEXT: v_or_b32_e32 v15, v15, v16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
@@ -38779,8 +39004,9 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v14, v16, v14
; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -38790,68 +39016,83 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v17, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v17, v4
; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v18, v19, v18
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v8, v19, v8
+; GFX7-NEXT: v_or_b32_e32 v9, v9, v10
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX7-NEXT: v_or_b32_e32 v11, v11, v12
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v12, v19, v12
+; GFX7-NEXT: v_or_b32_e32 v13, v13, v14
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
+; GFX7-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
+; GFX7-NEXT: v_or_b32_e32 v14, v19, v14
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
@@ -38874,10 +39115,11 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v16
; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -38953,200 +39195,232 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v7, v8, v7
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v9, v10, v9
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v11, v12, v11
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v13, v14, v13
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v14, v20, v14
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24
; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_or_b32_e32 v17, v19, v17
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v18, v18, v20
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v19, v20, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_or_b32_e32 v20, v21, v20
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v21, v22, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_or_b32_e32 v22, v23, v22
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_or_b32_e32 v23, v24, v23
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_or_b32_e32 v24, v25, v24
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v25, v26, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_or_b32_e32 v26, v27, v26
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_or_b32_e32 v27, v28, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_or_b32_e32 v28, v29, v28
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_or_b32_e32 v29, v30, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_or_b32_e32 v30, v31, v30
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_or_b32_e32 v31, v32, v31
; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc
; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc
; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc
@@ -39201,242 +39475,273 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_or_b32_e32 v19, v19, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v13, v13, v14
+; GFX7-NEXT: v_or_b32_e32 v27, v27, v28
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX7-NEXT: v_or_b32_e32 v23, v23, v24
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_or_b32_e32 v17, v17, v18
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_or_b32_e32 v21, v21, v22
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_or_b32_e32 v25, v25, v26
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_or_b32_e32 v29, v29, v30
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; GFX7-NEXT: s_waitcnt vmcnt(11)
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: s_waitcnt vmcnt(9)
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(5)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v8, v7
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v6, v2, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_or_b32_e32 v8, v31, v8
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v4, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v10, v31, v10
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v12, v31, v12
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v14, v31, v14
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v16, v31, v16
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v18, v31, v18
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v20, v31, v20
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v22, v31, v22
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v24, v31, v24
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v26, v31, v26
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v28, v31, v28
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v30, v31, v30
; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_or_b32_e32 v31, v32, v31
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_or_b32_e32 v32, v33, v32
; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
@@ -39556,12 +39861,14 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v2, v4, v3
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -39572,13 +39879,15 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
; GFX7-LABEL: s_select_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -39667,14 +39976,18 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v2, v4, v3
+; GCN-NEXT: v_or_b32_e32 v3, v6, v5
+; GCN-NEXT: v_or_b32_e32 v4, v8, v7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -39685,21 +39998,25 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
; GFX7-LABEL: s_select_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -42888,15 +43205,15 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-LABEL: v_fma_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -42907,9 +43224,9 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v2bf16:
@@ -43113,9 +43430,9 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v3bf16:
@@ -43388,10 +43705,10 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fma_v4bf16:
@@ -43798,8 +44115,8 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-LABEL: v_fmuladd_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
@@ -43808,13 +44125,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
@@ -43827,16 +44144,16 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v2bf16:
@@ -44140,9 +44457,9 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v3bf16:
@@ -44554,10 +44871,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmuladd_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 4787f21e28598..ac4eb9ca77a86 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -604,15 +604,17 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; SI-LABEL: v_bswap_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: s_mov_b32 s4, 0xff00ff
-; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
+; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
-; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
+; SI-NEXT: s_mov_b32 s4, 0xff00ff
+; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8
+; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
+; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
+; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
@@ -635,20 +637,21 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-LABEL: v_bswap_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: s_mov_b32 s4, 0xff00ff
-; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
+; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
+; SI-NEXT: s_mov_b32 s4, 0xff00ff
+; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8
+; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
-; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
+; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
+; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
@@ -673,25 +676,27 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
-; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
+; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
+; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
-; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8
-; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8
+; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
-; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_bfi_b32 v2, s4, v2, v4
-; SI-NEXT: v_bfi_b32 v3, s4, v3, v5
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v6
-; SI-NEXT: v_bfi_b32 v1, s4, v1, v7
+; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
+; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
+; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8
+; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
+; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
+; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
+; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index f4b432dce8c8a..7f03db19c1668 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -9347,31 +9347,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
@@ -9402,16 +9402,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -9444,17 +9445,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v1
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -9773,30 +9775,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -9828,16 +9830,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -9870,17 +9873,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -10437,13 +10441,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB28_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
@@ -10451,16 +10455,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v5, v5, v9
; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
@@ -10524,15 +10528,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB28_4 Depth 2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX7-NEXT: v_add_f32_e32 v6, v6, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v4, v7, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
@@ -10598,15 +10603,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB28_4 Depth 2
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_add_f32_e32 v6, v6, v9
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v4, v7, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
@@ -10954,31 +10960,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
@@ -11009,16 +11015,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -11051,17 +11058,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v1
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -11380,30 +11388,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -11435,16 +11443,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -11477,17 +11486,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -11817,31 +11827,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
@@ -11872,16 +11882,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -11914,17 +11925,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v1
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -12243,30 +12255,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -12298,16 +12310,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -12340,17 +12353,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -12669,30 +12683,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -12724,16 +12738,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
@@ -12766,17 +12781,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 6f1675edbe58a..c5d8267f9ec35 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -7797,31 +7797,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
@@ -7841,27 +7841,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_max_f32_e32 v6, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -7883,28 +7884,29 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_max_f32_e32 v6, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v1
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -8313,30 +8315,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -8357,36 +8359,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_max_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8399,14 +8402,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8414,22 +8417,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_max_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9127,13 +9131,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
@@ -9141,16 +9145,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v5, v5, v9
; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
@@ -9202,27 +9206,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_max_f32_e32 v4, v7, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
@@ -9276,27 +9281,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_max_f32_e32 v6, v6, v9
+; GFX6-NEXT: v_max_f32_e32 v4, v7, v10
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v10
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index acb27be1846b9..7dc5de97179f4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -7797,31 +7797,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
@@ -7841,27 +7841,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_min_f32_e32 v6, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -7883,28 +7884,29 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX6-NEXT: v_mov_b32_e32 v4, s6
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_min_f32_e32 v6, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v1
; GFX6-NEXT: v_mov_b32_e32 v5, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -8313,30 +8315,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
@@ -8357,36 +8359,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX7-NEXT: s_add_i32 s6, s20, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_min_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v6, v3, v1
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8399,14 +8402,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mov_b32_e32 v2, s20
; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
; GFX6-NEXT: s_add_i32 s6, s20, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s6
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8414,22 +8417,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_min_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v6, v3, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9127,13 +9131,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX8-NEXT: v_min_f32_e32 v4, v4, v8
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
@@ -9141,16 +9145,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v5, v5, v9
; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v4, v5
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mov_b32_e32 v5, v6
@@ -9202,27 +9206,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v9
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_min_f32_e32 v4, v7, v10
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v10
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mov_b32_e32 v6, v4
@@ -9276,27 +9281,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v9
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_min_f32_e32 v6, v6, v9
+; GFX6-NEXT: v_min_f32_e32 v4, v7, v10
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v10
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: v_mov_b32_e32 v6, v4
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 5c7172ff8d047..79d5251361f58 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -162,8 +162,8 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) {
; GFX8-LABEL: undef_lo2_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
@@ -187,8 +187,8 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) {
; GFX8-LABEL: undef_lo2_v4f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eaeff8eb1..3976151db89e6 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -189,7 +189,9 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16
+; GFX6-NEXT: s_lshr_b32 s4, s6, 16
+; GFX6-NEXT: s_or_b32 s4, s4, 0x50000
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 86e890b06989a..4c4b7a2d9a969 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1461,30 +1461,30 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
-; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:3
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:2
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
-; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[12:15], 0 addr64 offset:2
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
-; SI-NEXT: v_or_b32_e32 v5, v5, v4
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; SI-NEXT: v_or_b32_e32 v6, v3, v2
+; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_or_b32_e32 v6, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
; SI-NEXT: v_or_b32_e32 v4, v4, v6
+; SI-NEXT: v_or_b32_e32 v5, v5, v6
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 85e56a243cdc9..acab8c6b44e77 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -480,13 +480,14 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000
+; GCN-NEXT: s_or_b32 s2, s2, s3
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_lshr_b32 s0, s3, 16
; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -534,8 +535,9 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
; GCN-LABEL: divergent_vec_i16_HH:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_i16_HH:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index d8f81db70e309..5d1023fc9249d 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -220,16 +220,18 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s4, s3, 0xffff0000
; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3|
-; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5|
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; CI-NEXT: v_or_b32_e32 v1, v1, v0
+; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s5|
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v0, v2, v0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -537,16 +539,17 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3|
; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2|
-; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; CI-NEXT: v_mul_f32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; CI-NEXT: v_mul_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -585,8 +588,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -716,18 +719,19 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v2, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_and_b32 s1, s4, 0xffff0000
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshl_b32 s0, s4, 16
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_and_b32 s0, s4, 0xffff0000
+; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0x7fff0000, v2
-; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e32 v3, s1, v3
+; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2
+; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mul_f32_e32 v2, s0, v2
+; CI-NEXT: v_mul_f32_e32 v3, s1, v3
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -767,8 +771,8 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 450d66767600b..0288524db268b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -3975,20 +3975,20 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4116,42 +4116,42 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
-; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
+; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[2:3]
+; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[0:1]
; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v7
; GFX8-NEXT: v_and_b32_e32 v9, 1, v7
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]|
-; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6]
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]|
+; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6]
; GFX8-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7]
; GFX8-NEXT: v_add_u32_e64 v5, s[6:7], v7, v5
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc
; GFX8-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v5, v7
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1]
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]|
-; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6]
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v5
+; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v9, v7, s[4:5]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7]
-; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], v8, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v8
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]|
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc
+; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6]
+; GFX8-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v3, -1, 1, s[6:7]
+; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], v8, v3
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
-; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -4371,18 +4371,18 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4850,26 +4850,27 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in
;
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x10010
-; GFX8-NEXT: s_add_i32 s4, s4, s0
-; GFX8-NEXT: s_or_b32 s3, s0, 0x400000
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010
+; GFX8-NEXT: s_add_i32 s4, s4, s1
+; GFX8-NEXT: s_or_b32 s3, s1, 0x400000
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT: s_cselect_b32 s3, s3, s6
-; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_or_b32 s4, s1, 0x400000
-; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff
-; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1
+; GFX8-NEXT: s_cselect_b32 s1, s3, s6
+; GFX8-NEXT: s_and_b32 s3, s1, 0x7fff0000
+; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX8-NEXT: s_add_i32 s1, s1, s0
+; GFX8-NEXT: s_or_b32 s4, s0, 0x400000
+; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff
+; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GFX8-NEXT: s_cselect_b32 s0, s4, s5
; GFX8-NEXT: s_lshr_b32 s0, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff
+; GFX8-NEXT: s_or_b32 s0, s0, s3
+; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -4995,52 +4996,53 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i
;
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[0:1]
-; GFX8-NEXT: v_cvt_f32_f64_e32 v3, s[2:3]
-; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], s[0:1], s[0:1]
+; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
+; GFX8-NEXT: v_cvt_f32_f64_e32 v3, s[0:1]
+; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], s[2:3], s[2:3]
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; GFX8-NEXT: v_readfirstlane_b32 s5, v2
; GFX8-NEXT: s_bitcmp1_b32 s5, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], |s[0:1]|, |v[0:1]|
+; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[2:3]|, |v[0:1]|
; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
-; GFX8-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[2:3], v[0:1]
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[12:13], |s[2:3]|, |v[0:1]|
+; GFX8-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[0:1], v[0:1]
; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX8-NEXT: s_cselect_b32 s6, 1, -1
+; GFX8-NEXT: s_add_i32 s12, s5, s6
+; GFX8-NEXT: s_and_b64 s[6:7], s[10:11], exec
+; GFX8-NEXT: s_cselect_b32 s5, s5, s12
+; GFX8-NEXT: s_bfe_u32 s6, s5, 0x10010
+; GFX8-NEXT: s_or_b32 s10, s5, 0x400000
+; GFX8-NEXT: s_add_i32 s5, s6, s5
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[0:1]|, |v[0:1]|
+; GFX8-NEXT: s_addk_i32 s5, 0x7fff
; GFX8-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX8-NEXT: s_cselect_b32 s8, 1, -1
-; GFX8-NEXT: s_add_i32 s14, s5, s8
-; GFX8-NEXT: s_and_b64 s[8:9], s[10:11], exec
-; GFX8-NEXT: s_cselect_b32 s5, s5, s14
-; GFX8-NEXT: s_bfe_u32 s8, s5, 0x10010
-; GFX8-NEXT: s_add_i32 s8, s8, s5
-; GFX8-NEXT: s_addk_i32 s8, 0x7fff
-; GFX8-NEXT: s_bitset1_b32 s5, 22
+; GFX8-NEXT: s_cselect_b32 s5, s10, s5
+; GFX8-NEXT: s_and_b32 s5, s5, 0x7fff0000
+; GFX8-NEXT: v_readfirstlane_b32 s10, v3
+; GFX8-NEXT: s_bitcmp1_b32 s10, 0
+; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX8-NEXT: s_cselect_b32 s5, s5, s8
-; GFX8-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8-NEXT: s_bitcmp1_b32 s8, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec
-; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3]
+; GFX8-NEXT: v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1]
; GFX8-NEXT: s_cselect_b32 s6, 1, -1
-; GFX8-NEXT: s_add_i32 s6, s8, s6
+; GFX8-NEXT: s_add_i32 s6, s10, s6
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s2, s10, s6
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
+; GFX8-NEXT: s_add_i32 s3, s3, s2
+; GFX8-NEXT: s_addk_i32 s3, 0x7fff
+; GFX8-NEXT: s_bitset1_b32 s2, 22
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GFX8-NEXT: s_cselect_b32 s0, s8, s6
-; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010
-; GFX8-NEXT: s_add_i32 s1, s1, s0
-; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff
-; GFX8-NEXT: s_or_b32 s7, s0, 0x400000
-; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec
-; GFX8-NEXT: s_cselect_b32 s0, s7, s6
+; GFX8-NEXT: s_cselect_b32 s0, s2, s3
; GFX8-NEXT: s_lshr_b32 s0, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff
+; GFX8-NEXT: s_or_b32 s0, s0, s5
+; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
+; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -5249,24 +5251,25 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
;
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10010
-; GFX8-NEXT: s_add_i32 s3, s3, s1
-; GFX8-NEXT: s_addk_i32 s3, 0x7fff
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
-; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT: s_cselect_b32 s1, s1, s3
; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
; GFX8-NEXT: s_add_i32 s3, s3, s2
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX8-NEXT: s_cselect_b32 s2, s2, s3
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16
-; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
+; GFX8-NEXT: s_and_b32 s4, s2, 0x80000000
+; GFX8-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX8-NEXT: s_add_i32 s2, s2, s1
+; GFX8-NEXT: s_add_i32 s5, s2, 0x7fff
+; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s1, s1, s5
+; GFX8-NEXT: s_lshr_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s1, s1, s4
+; GFX8-NEXT: s_mov_b32 s2, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
@@ -5779,9 +5782,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %ma
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v4
@@ -6004,8 +6007,8 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m
; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v6
; GFX8-NEXT: v_bfi_b32 v1, s4, v10, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -6315,8 +6318,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
@@ -6324,9 +6328,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4
@@ -6991,11 +6995,11 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16(<4 x float> %ma
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v3
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -7229,19 +7233,18 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v13, vcc
; GFX8-NEXT: v_bfe_u32 v4, v10, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, s8, v4
-; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v4, v10
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v11
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s8, v13
+; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
+; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
; GFX8-NEXT: v_and_b32_e32 v7, 1, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v13, v10, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]|
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
-; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5]
-; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[2:3]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[2:3]
; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v11, v4
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc
@@ -7267,9 +7270,10 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m
; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT: v_alignbit_b32 v1, v6, v12, 16
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v6
+; GFX8-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v8
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v9
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -7675,12 +7679,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v3
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v5
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 8c7d5cffe39d9..97a52b434cb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -18886,30 +18886,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18936,16 +18936,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19215,30 +19216,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19266,16 +19267,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19562,30 +19564,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19613,16 +19615,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19881,29 +19884,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19930,16 +19933,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -20201,29 +20205,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -20252,16 +20256,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -20545,29 +20550,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -20596,16 +20601,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -20877,30 +20883,30 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -20928,16 +20934,17 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -21203,29 +21210,29 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21254,16 +21261,17 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -21529,30 +21537,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21579,16 +21587,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -21848,29 +21857,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21897,16 +21906,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -22171,30 +22181,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -22221,16 +22231,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -22490,29 +22501,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -22539,16 +22550,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 56ad91dd59ffb..4cac44b9fd21d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -16534,30 +16534,30 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16574,26 +16574,27 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -16988,30 +16989,30 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17029,8 +17030,8 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -17039,16 +17040,17 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17464,30 +17466,30 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17505,8 +17507,8 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -17515,16 +17517,17 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17904,29 +17907,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17942,34 +17945,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18345,29 +18349,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18385,34 +18389,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18813,29 +18818,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18853,34 +18858,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19271,30 +19277,30 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19312,8 +19318,8 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -19322,16 +19328,17 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19719,29 +19726,29 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19759,34 +19766,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index f0083bd23660a..85194c53082a0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -16534,30 +16534,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16574,26 +16574,27 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -16988,30 +16989,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17029,8 +17030,8 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -17039,16 +17040,17 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17464,30 +17466,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17505,8 +17507,8 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -17515,16 +17517,17 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17904,29 +17907,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17942,34 +17945,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18345,29 +18349,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18385,34 +18389,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18813,29 +18818,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18853,34 +18858,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19271,30 +19277,30 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19312,8 +19318,8 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -19322,16 +19328,17 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19719,29 +19726,29 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19759,34 +19766,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
-; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 3ee0bb2122abe..64320ceab2f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -15963,30 +15963,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16013,16 +16013,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -16417,30 +16418,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16468,16 +16469,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -16893,30 +16895,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16944,16 +16946,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17333,29 +17336,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17382,16 +17385,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -17774,29 +17778,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17825,16 +17829,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -18242,29 +18247,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18293,16 +18298,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -18700,30 +18706,30 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18751,16 +18757,17 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -19148,29 +19155,29 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19199,16 +19206,17 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 64a9727330cfd..3320a21097a3c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -435,19 +435,22 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: v_add_f32_e64 v0, s3, 2.0
; CI-NEXT: v_add_f32_e64 v1, s2, 1.0
-; CI-NEXT: v_readfirstlane_b32 s2, v0
+; CI-NEXT: v_readfirstlane_b32 s2, v1
+; CI-NEXT: v_readfirstlane_b32 s3, v0
; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: s_and_b32 s3, s3, 0xffff0000
; CI-NEXT: s_bitset0_b32 s2, 31
-; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
+; CI-NEXT: s_bitset0_b32 s3, 31
+; CI-NEXT: s_and_b32 s3, s3, 0xffff0000
; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; CI-NEXT: s_xor_b32 s3, s3, 0x80000000
; CI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: s_and_b32 s3, s3, 0xffff0000
; CI-NEXT: s_lshr_b32 s2, s2, 16
-; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16
+; CI-NEXT: s_or_b32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -459,24 +462,24 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: v_add_f32_e64 v0, s3, 1.0
+; VI-NEXT: s_and_b32 s3, s2, 0xffff0000
+; VI-NEXT: v_add_f32_e64 v0, s3, 2.0
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; VI-NEXT: v_add_f32_e64 v1, s2, 2.0
+; VI-NEXT: v_add_f32_e64 v1, s2, 1.0
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -570,9 +573,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
; CI-NEXT: s_lshl_b32 s2, s3, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v2, v1, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -631,24 +635,26 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b32 s4, s2, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; CI-NEXT: s_lshl_b32 s5, s3, 16
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
; CI-NEXT: s_and_b32 s2, s3, 0xffff0000
-; CI-NEXT: s_lshl_b32 s5, s3, 16
-; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2|
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5|
+; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2|
; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v1, v1, v3
+; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -730,13 +736,14 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0
+; CI-NEXT: s_and_b32 s3, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; CI-NEXT: s_lshl_b32 s2, s3, 16
; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v2, v1, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -769,8 +776,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -861,16 +868,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
-; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2
; CI-NEXT: s_lshl_b32 s1, s1, 16
+; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2
+; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1
-; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_or_b32_e32 v4, v5, v4
; CI-NEXT: v_mov_b32_e32 v5, s0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_store_dword v[0:1], v5
@@ -943,16 +951,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
; CI-NEXT: s_lshl_b32 s1, s1, 16
-; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0
-; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000
+; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0
; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
+; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; CI-NEXT: v_or_b32_e32 v4, v5, v4
; CI-NEXT: v_mov_b32_e32 v5, s0
-; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_store_dword v[0:1], v5
; CI-NEXT: flat_store_dword v[2:3], v4
@@ -988,8 +997,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b283290a..f73625132998d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1382,14 +1382,16 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index d232693b46ad9..fc976fba4ebab 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -368,8 +368,9 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v2, v1, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -426,11 +427,12 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
; CI-NEXT: ; def s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_and_b32 s3, s2, 0xffff0000
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
@@ -505,9 +507,10 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -573,8 +576,9 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v2, v1, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -637,16 +641,17 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3
; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2
-; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; CI-NEXT: v_mul_f32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; CI-NEXT: v_mul_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -682,8 +687,8 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 9a347d71bf430..7f83f8a25ec3e 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -7017,14 +7017,15 @@ define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-SDAG-NEXT: v_alignbit_b32 v0, v4, v0, 16
+; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -7055,14 +7056,15 @@ define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-SDAG-NEXT: v_alignbit_b32 v0, v4, v0, 16
+; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 43caa4c739fb3..816697cc476e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -18,12 +18,12 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: s_lshr_b32 s1, s0, 1
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; SI-NEXT: s_not_b32 s0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1
+; SI-NEXT: s_lshl_b32 s0, s0, s2
+; SI-NEXT: s_lshr_b32 s1, s1, 1
+; SI-NEXT: s_not_b32 s2, s2
+; SI-NEXT: s_lshr_b32 s1, s1, s2
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -32,14 +32,14 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: s_lshl_b32 s0, s0, s2
+; VI-NEXT: s_lshr_b32 s1, s1, 1
; VI-NEXT: s_not_b32 s2, s2
-; VI-NEXT: s_lshr_b32 s1, s0, 1
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1
+; VI-NEXT: s_lshr_b32 s1, s1, s2
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -49,12 +49,12 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_not_b32 s2, s2
-; GFX9-NEXT: s_lshr_b32 s1, s0, 1
-; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -77,13 +77,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s1, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
-; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_not_b32 s3, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32:
@@ -91,14 +93,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
-; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_endpgm
entry:
%0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -113,10 +116,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
+; SI-NEXT: s_lshr_b32 s0, s3, 25
+; SI-NEXT: s_lshl_b32 s1, s2, 7
+; SI-NEXT: s_or_b32 s0, s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -124,10 +129,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
+; VI-NEXT: s_lshr_b32 s3, s3, 25
+; VI-NEXT: s_lshl_b32 s2, s2, 7
+; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -136,8 +143,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
+; GFX9-NEXT: s_lshr_b32 s3, s3, 25
+; GFX9-NEXT: s_lshl_b32 s2, s2, 7
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -158,16 +167,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
+; GFX10-NEXT: s_lshr_b32 s3, s3, 25
+; GFX10-NEXT: s_lshl_b32 s2, s2, 7
+; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25
+; GFX11-NEXT: s_lshr_b32 s3, s3, 25
+; GFX11-NEXT: s_lshl_b32 s2, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -179,70 +194,70 @@ entry:
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
-; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; SI-NEXT: s_not_b32 s3, s5
-; SI-NEXT: s_lshr_b32 s1, s1, 1
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_not_b32 s1, s4
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; SI-NEXT: s_lshr_b32 s0, s0, 1
-; SI-NEXT: v_mov_b32_e32 v2, s1
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_lshl_b32 s1, s1, s9
+; SI-NEXT: s_lshr_b32 s3, s3, 1
+; SI-NEXT: s_not_b32 s9, s9
+; SI-NEXT: s_lshr_b32 s3, s3, s9
+; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: s_lshr_b32 s2, s2, 1
+; SI-NEXT: s_not_b32 s3, s8
+; SI-NEXT: s_lshl_b32 s0, s0, s8
+; SI-NEXT: s_lshr_b32 s2, s2, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: s_lshl_b32 s1, s1, s7
+; VI-NEXT: s_lshr_b32 s3, s3, 1
; VI-NEXT: s_not_b32 s7, s7
-; VI-NEXT: s_lshr_b32 s3, s1, 1
-; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_not_b32 s1, s6
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; VI-NEXT: s_lshr_b32 s0, s0, 1
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT: s_lshr_b32 s3, s3, s7
+; VI-NEXT: s_or_b32 s1, s1, s3
+; VI-NEXT: s_lshr_b32 s2, s2, 1
+; VI-NEXT: s_not_b32 s3, s6
+; VI-NEXT: s_lshl_b32 s0, s0, s6
+; VI-NEXT: s_lshr_b32 s2, s2, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_lshr_b32 s3, s1, 1
-; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; GFX9-NEXT: s_not_b32 s1, s9
+; GFX9-NEXT: s_not_b32 s4, s7
+; GFX9-NEXT: s_lshr_b32 s3, s3, 1
+; GFX9-NEXT: s_lshl_b32 s1, s1, s7
+; GFX9-NEXT: s_lshr_b32 s3, s3, s4
+; GFX9-NEXT: s_or_b32 s1, s1, s3
+; GFX9-NEXT: s_lshr_b32 s2, s2, 1
+; GFX9-NEXT: s_not_b32 s3, s6
+; GFX9-NEXT: s_lshl_b32 s0, s0, s6
+; GFX9-NEXT: s_lshr_b32 s2, s2, s3
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_not_b32 s1, s8
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
@@ -266,38 +281,46 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-LABEL: fshl_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1
-; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_not_b32 s2, s7
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s3, s6
-; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3
+; GFX10-NEXT: s_not_b32 s4, s7
+; GFX10-NEXT: s_lshr_b32 s3, s3, 1
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
+; GFX10-NEXT: s_not_b32 s5, s6
+; GFX10-NEXT: s_lshl_b32 s1, s1, s7
+; GFX10-NEXT: s_lshl_b32 s0, s0, s6
+; GFX10-NEXT: s_lshr_b32 s2, s2, s5
+; GFX10-NEXT: s_lshr_b32 s3, s3, s4
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1
-; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_not_b32 s2, s7
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s3, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3
+; GFX11-NEXT: s_not_b32 s8, s6
+; GFX11-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
+; GFX11-NEXT: s_not_b32 s7, s7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshr_b32 s2, s2, s8
+; GFX11-NEXT: s_lshr_b32 s3, s3, s7
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -314,10 +337,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25
+; SI-NEXT: s_lshr_b32 s3, s3, 23
+; SI-NEXT: s_lshl_b32 s1, s1, 9
+; SI-NEXT: s_lshr_b32 s2, s2, 25
+; SI-NEXT: s_lshl_b32 s0, s0, 7
+; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -326,11 +353,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25
+; VI-NEXT: s_lshr_b32 s3, s3, 23
+; VI-NEXT: s_lshl_b32 s1, s1, 9
+; VI-NEXT: s_lshr_b32 s2, s2, 25
+; VI-NEXT: s_lshl_b32 s0, s0, 7
+; VI-NEXT: s_or_b32 s1, s1, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -341,10 +372,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25
+; GFX9-NEXT: s_lshr_b32 s3, s3, 23
+; GFX9-NEXT: s_lshl_b32 s1, s1, 9
+; GFX9-NEXT: s_lshr_b32 s2, s2, 25
+; GFX9-NEXT: s_lshl_b32 s0, s0, 7
+; GFX9-NEXT: s_or_b32 s1, s1, s3
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -369,8 +404,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25
+; GFX10-NEXT: s_lshr_b32 s3, s3, 23
+; GFX10-NEXT: s_lshr_b32 s2, s2, 25
+; GFX10-NEXT: s_lshl_b32 s0, s0, 7
+; GFX10-NEXT: s_lshl_b32 s1, s1, 9
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -379,10 +420,16 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25
+; GFX11-NEXT: s_lshr_b32 s3, s3, 23
+; GFX11-NEXT: s_lshr_b32 s2, s2, 25
+; GFX11-NEXT: s_lshl_b32 s0, s0, 7
+; GFX11-NEXT: s_lshl_b32 s1, s1, 9
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -395,36 +442,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-LABEL: fshl_v4i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
-; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
-; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_not_b32 s5, s19
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1
-; SI-NEXT: s_lshr_b32 s4, s11, 1
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: s_not_b32 s5, s18
-; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1
-; SI-NEXT: s_lshr_b32 s4, s10, 1
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: s_not_b32 s5, s17
-; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1
-; SI-NEXT: s_lshr_b32 s4, s9, 1
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: s_not_b32 s5, s16
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
-; SI-NEXT: s_lshr_b32 s4, s8, 1
-; SI-NEXT: v_mov_b32_e32 v4, s5
-; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_lshr_b32 s15, s15, 1
+; SI-NEXT: s_lshl_b32 s11, s11, s3
+; SI-NEXT: s_not_b32 s3, s3
+; SI-NEXT: s_lshr_b32 s3, s15, s3
+; SI-NEXT: s_or_b32 s3, s11, s3
+; SI-NEXT: s_lshl_b32 s10, s10, s2
+; SI-NEXT: s_lshr_b32 s11, s14, 1
+; SI-NEXT: s_not_b32 s2, s2
+; SI-NEXT: s_lshr_b32 s2, s11, s2
+; SI-NEXT: s_or_b32 s2, s10, s2
+; SI-NEXT: s_lshl_b32 s9, s9, s1
+; SI-NEXT: s_lshr_b32 s10, s13, 1
+; SI-NEXT: s_not_b32 s1, s1
+; SI-NEXT: s_lshr_b32 s1, s10, s1
+; SI-NEXT: s_or_b32 s1, s9, s1
+; SI-NEXT: s_lshl_b32 s8, s8, s0
+; SI-NEXT: s_lshr_b32 s9, s12, 1
+; SI-NEXT: s_not_b32 s0, s0
+; SI-NEXT: s_lshr_b32 s0, s9, s0
+; SI-NEXT: s_or_b32 s0, s8, s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v4i32:
@@ -433,31 +480,31 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s15
+; VI-NEXT: s_lshr_b32 s7, s15, 1
+; VI-NEXT: s_lshl_b32 s6, s11, s3
; VI-NEXT: s_not_b32 s3, s3
-; VI-NEXT: s_lshr_b32 s6, s11, 1
-; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s14
+; VI-NEXT: s_lshr_b32 s3, s7, s3
+; VI-NEXT: s_or_b32 s3, s6, s3
+; VI-NEXT: s_lshl_b32 s6, s10, s2
+; VI-NEXT: s_lshr_b32 s7, s14, 1
; VI-NEXT: s_not_b32 s2, s2
-; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1
-; VI-NEXT: s_lshr_b32 s3, s10, 1
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s13
+; VI-NEXT: s_lshr_b32 s2, s7, s2
+; VI-NEXT: s_or_b32 s2, s6, s2
+; VI-NEXT: s_lshl_b32 s6, s9, s1
+; VI-NEXT: s_lshr_b32 s7, s13, 1
; VI-NEXT: s_not_b32 s1, s1
-; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s9, 1
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_lshr_b32 s1, s7, s1
+; VI-NEXT: s_or_b32 s1, s6, s1
+; VI-NEXT: s_lshl_b32 s6, s8, s0
+; VI-NEXT: s_lshr_b32 s7, s12, 1
; VI-NEXT: s_not_b32 s0, s0
-; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
-; VI-NEXT: s_lshr_b32 s1, s8, 1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4
+; VI-NEXT: s_lshr_b32 s0, s7, s0
+; VI-NEXT: s_or_b32 s0, s6, s0
; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -469,30 +516,30 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshr_b32 s5, s15, 1
+; GFX9-NEXT: s_lshl_b32 s4, s11, s3
; GFX9-NEXT: s_not_b32 s3, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: s_lshr_b32 s4, s11, 1
-; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-NEXT: s_lshr_b32 s3, s5, s3
+; GFX9-NEXT: s_or_b32 s3, s4, s3
+; GFX9-NEXT: s_lshl_b32 s4, s10, s2
+; GFX9-NEXT: s_lshr_b32 s5, s14, 1
; GFX9-NEXT: s_not_b32 s2, s2
-; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
-; GFX9-NEXT: s_lshr_b32 s3, s10, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
+; GFX9-NEXT: s_lshr_b32 s2, s5, s2
+; GFX9-NEXT: s_or_b32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b32 s4, s9, s1
+; GFX9-NEXT: s_lshr_b32 s5, s13, 1
; GFX9-NEXT: s_not_b32 s1, s1
-; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1
-; GFX9-NEXT: s_lshr_b32 s2, s9, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: s_lshr_b32 s1, s5, s1
+; GFX9-NEXT: s_or_b32 s1, s4, s1
+; GFX9-NEXT: s_lshl_b32 s4, s8, s0
+; GFX9-NEXT: s_lshr_b32 s5, s12, 1
; GFX9-NEXT: s_not_b32 s0, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
-; GFX9-NEXT: s_lshr_b32 s1, s8, 1
-; GFX9-NEXT: v_mov_b32_e32 v5, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
+; GFX9-NEXT: s_lshr_b32 s0, s5, s0
+; GFX9-NEXT: s_or_b32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -530,22 +577,30 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1
-; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1
-; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1
-; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1
-; GFX10-NEXT: s_lshr_b32 s4, s11, 1
+; GFX10-NEXT: s_lshr_b32 s5, s15, 1
+; GFX10-NEXT: s_lshl_b32 s4, s11, s3
; GFX10-NEXT: s_not_b32 s3, s3
-; GFX10-NEXT: s_lshr_b32 s5, s10, 1
+; GFX10-NEXT: s_lshl_b32 s10, s10, s2
+; GFX10-NEXT: s_lshr_b32 s11, s14, 1
; GFX10-NEXT: s_not_b32 s2, s2
-; GFX10-NEXT: s_lshr_b32 s9, s9, 1
+; GFX10-NEXT: s_lshl_b32 s9, s9, s1
+; GFX10-NEXT: s_lshr_b32 s13, s13, 1
; GFX10-NEXT: s_not_b32 s1, s1
-; GFX10-NEXT: s_lshr_b32 s8, s8, 1
+; GFX10-NEXT: s_lshl_b32 s8, s8, s0
+; GFX10-NEXT: s_lshr_b32 s12, s12, 1
; GFX10-NEXT: s_not_b32 s0, s0
-; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3
-; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2
-; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1
-; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0
+; GFX10-NEXT: s_lshr_b32 s3, s5, s3
+; GFX10-NEXT: s_lshr_b32 s2, s11, s2
+; GFX10-NEXT: s_lshr_b32 s1, s13, s1
+; GFX10-NEXT: s_lshr_b32 s0, s12, s0
+; GFX10-NEXT: s_or_b32 s3, s4, s3
+; GFX10-NEXT: s_or_b32 s2, s10, s2
+; GFX10-NEXT: s_or_b32 s0, s8, s0
+; GFX10-NEXT: s_or_b32 s1, s9, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -555,24 +610,31 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1
-; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1
-; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1
-; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1
-; GFX11-NEXT: s_lshr_b32 s6, s11, 1
+; GFX11-NEXT: s_lshr_b32 s7, s15, 1
+; GFX11-NEXT: s_lshl_b32 s6, s11, s3
; GFX11-NEXT: s_not_b32 s3, s3
-; GFX11-NEXT: s_lshr_b32 s7, s10, 1
+; GFX11-NEXT: s_lshl_b32 s10, s10, s2
+; GFX11-NEXT: s_lshr_b32 s11, s14, 1
; GFX11-NEXT: s_not_b32 s2, s2
-; GFX11-NEXT: s_lshr_b32 s9, s9, 1
+; GFX11-NEXT: s_lshl_b32 s9, s9, s1
+; GFX11-NEXT: s_lshr_b32 s13, s13, 1
; GFX11-NEXT: s_not_b32 s1, s1
-; GFX11-NEXT: s_lshr_b32 s8, s8, 1
+; GFX11-NEXT: s_lshl_b32 s8, s8, s0
+; GFX11-NEXT: s_lshr_b32 s12, s12, 1
; GFX11-NEXT: s_not_b32 s0, s0
-; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3
-; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1
-; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0
+; GFX11-NEXT: s_lshr_b32 s3, s7, s3
+; GFX11-NEXT: s_lshr_b32 s2, s11, s2
+; GFX11-NEXT: s_lshr_b32 s1, s13, s1
+; GFX11-NEXT: s_lshr_b32 s0, s12, s0
+; GFX11-NEXT: s_or_b32 s3, s6, s3
+; GFX11-NEXT: s_or_b32 s2, s10, s2
+; GFX11-NEXT: s_or_b32 s0, s8, s0
+; GFX11-NEXT: s_or_b32 s1, s9, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -589,14 +651,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_mov_b32_e32 v1, s14
-; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23
-; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
+; SI-NEXT: s_lshr_b32 s4, s15, 31
+; SI-NEXT: s_lshl_b32 s5, s11, 1
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_lshr_b32 s5, s14, 23
+; SI-NEXT: s_lshl_b32 s6, s10, 9
+; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: s_lshr_b32 s6, s13, 25
+; SI-NEXT: s_lshl_b32 s7, s9, 7
+; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_lshr_b32 s7, s12, 31
+; SI-NEXT: s_lshl_b32 s8, s8, 1
+; SI-NEXT: s_or_b32 s7, s8, s7
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -605,15 +675,23 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s15
-; VI-NEXT: v_mov_b32_e32 v1, s14
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
-; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
-; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_lshr_b32 s2, s15, 31
+; VI-NEXT: s_lshl_b32 s3, s11, 1
+; VI-NEXT: s_lshr_b32 s4, s14, 23
+; VI-NEXT: s_lshl_b32 s5, s10, 9
+; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_or_b32 s3, s5, s4
+; VI-NEXT: s_lshr_b32 s4, s13, 25
+; VI-NEXT: s_lshl_b32 s5, s9, 7
+; VI-NEXT: s_or_b32 s4, s5, s4
+; VI-NEXT: s_lshr_b32 s5, s12, 31
+; VI-NEXT: s_lshl_b32 s6, s8, 1
+; VI-NEXT: s_or_b32 s5, s6, s5
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -624,14 +702,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23
-; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31
+; GFX9-NEXT: s_lshr_b32 s2, s15, 31
+; GFX9-NEXT: s_lshl_b32 s3, s11, 1
+; GFX9-NEXT: s_lshr_b32 s4, s14, 23
+; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: s_lshl_b32 s3, s10, 9
+; GFX9-NEXT: s_or_b32 s3, s3, s4
+; GFX9-NEXT: s_lshr_b32 s4, s13, 25
+; GFX9-NEXT: s_lshl_b32 s5, s9, 7
+; GFX9-NEXT: s_or_b32 s4, s5, s4
+; GFX9-NEXT: s_lshr_b32 s5, s12, 31
+; GFX9-NEXT: s_lshl_b32 s6, s8, 1
+; GFX9-NEXT: s_or_b32 s5, s6, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -660,10 +746,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31
+; GFX10-NEXT: s_lshr_b32 s2, s15, 31
+; GFX10-NEXT: s_lshl_b32 s3, s11, 1
+; GFX10-NEXT: s_lshr_b32 s4, s14, 23
+; GFX10-NEXT: s_lshl_b32 s5, s10, 9
+; GFX10-NEXT: s_lshr_b32 s6, s13, 25
+; GFX10-NEXT: s_lshl_b32 s7, s9, 7
+; GFX10-NEXT: s_lshr_b32 s9, s12, 31
+; GFX10-NEXT: s_lshl_b32 s8, s8, 1
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_or_b32 s3, s5, s4
+; GFX10-NEXT: s_or_b32 s4, s8, s9
+; GFX10-NEXT: s_or_b32 s5, s7, s6
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -672,12 +770,23 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31
+; GFX11-NEXT: s_lshr_b32 s2, s15, 31
+; GFX11-NEXT: s_lshl_b32 s3, s11, 1
+; GFX11-NEXT: s_lshr_b32 s4, s14, 23
+; GFX11-NEXT: s_lshl_b32 s5, s10, 9
+; GFX11-NEXT: s_lshr_b32 s6, s13, 25
+; GFX11-NEXT: s_lshl_b32 s7, s9, 7
+; GFX11-NEXT: s_lshr_b32 s9, s12, 31
+; GFX11-NEXT: s_lshl_b32 s8, s8, 1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_or_b32 s3, s5, s4
+; GFX11-NEXT: s_or_b32 s4, s8, s9
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 4a79096442c96..123b5f8f74115 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -30,9 +30,12 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; SI-NEXT: s_lshr_b32 s1, s1, s2
+; SI-NEXT: s_lshl_b32 s0, s0, 1
+; SI-NEXT: s_not_b32 s2, s2
+; SI-NEXT: s_lshl_b32 s0, s0, s2
+; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -41,11 +44,14 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
+; VI-NEXT: s_lshr_b32 s1, s1, s2
+; VI-NEXT: s_lshl_b32 s0, s0, 1
+; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: s_lshl_b32 s0, s0, s2
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -55,9 +61,12 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_not_b32 s2, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -77,62 +86,48 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_not_b32 s3, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[6:7]
; GFX10-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: fshr_i32:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: fshr_i32:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX11-FAKE16-NEXT: s_endpgm
-;
-; GFX12-TRUE16-LABEL: fshr_i32:
-; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_clause 0x1
-; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l
-; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: fshr_i32:
-; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_clause 0x1
-; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
-; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: fshr_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-NEXT: s_not_b32 s3, s2
+; GFX12-NEXT: s_lshr_b32 s1, s1, s2
+; GFX12-NEXT: s_lshl_b32 s0, s0, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s0, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX12-NEXT: s_endpgm
entry:
%0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
store i32 %0, ptr addrspace(1) %in
@@ -146,10 +141,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7
+; SI-NEXT: s_lshr_b32 s0, s3, 7
+; SI-NEXT: s_lshl_b32 s1, s2, 25
+; SI-NEXT: s_or_b32 s0, s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -157,10 +154,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7
+; VI-NEXT: s_lshr_b32 s3, s3, 7
+; VI-NEXT: s_lshl_b32 s2, s2, 25
+; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -169,8 +168,10 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7
+; GFX9-NEXT: s_lshr_b32 s3, s3, 7
+; GFX9-NEXT: s_lshl_b32 s2, s2, 25
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -191,25 +192,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7
+; GFX10-NEXT: s_lshr_b32 s3, s3, 7
+; GFX10-NEXT: s_lshl_b32 s2, s2, 25
+; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7
+; GFX11-NEXT: s_lshr_b32 s3, s3, 7
+; GFX11-NEXT: s_lshl_b32 s2, s2, 25
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fshr_i32_imm:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7
+; GFX12-NEXT: s_lshr_b32 s3, s3, 7
+; GFX12-NEXT: s_lshl_b32 s2, s2, 25
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s2, s2, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -221,51 +231,69 @@ entry:
define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; SI-NEXT: s_lshr_b32 s3, s3, s9
+; SI-NEXT: s_lshl_b32 s1, s1, 1
+; SI-NEXT: s_not_b32 s9, s9
+; SI-NEXT: s_lshl_b32 s1, s1, s9
+; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: s_lshl_b32 s0, s0, 1
+; SI-NEXT: s_not_b32 s3, s8
+; SI-NEXT: s_lshr_b32 s2, s2, s8
+; SI-NEXT: s_lshl_b32 s0, s0, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0
+; VI-NEXT: s_lshr_b32 s3, s3, s7
+; VI-NEXT: s_lshl_b32 s1, s1, 1
+; VI-NEXT: s_not_b32 s7, s7
+; VI-NEXT: s_lshl_b32 s1, s1, s7
+; VI-NEXT: s_or_b32 s1, s1, s3
+; VI-NEXT: s_lshl_b32 s0, s0, 1
+; VI-NEXT: s_not_b32 s3, s6
+; VI-NEXT: s_lshr_b32 s2, s2, s6
+; VI-NEXT: s_lshl_b32 s0, s0, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
+; GFX9-NEXT: s_not_b32 s4, s7
+; GFX9-NEXT: s_lshl_b32 s1, s1, 1
+; GFX9-NEXT: s_lshr_b32 s3, s3, s7
+; GFX9-NEXT: s_lshl_b32 s1, s1, s4
+; GFX9-NEXT: s_or_b32 s1, s1, s3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_not_b32 s3, s6
+; GFX9-NEXT: s_lshr_b32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b32 s0, s0, s3
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
;
@@ -289,76 +317,68 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s7
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9]
+; GFX10-NEXT: s_not_b32 s4, s7
+; GFX10-NEXT: s_lshl_b32 s1, s1, 1
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_not_b32 s5, s6
+; GFX10-NEXT: s_lshr_b32 s3, s3, s7
+; GFX10-NEXT: s_lshr_b32 s2, s2, s6
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s1, s1, s4
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: fshr_v2i32:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h
-; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: fshr_v2i32:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
-; GFX11-FAKE16-NEXT: s_endpgm
-;
-; GFX12-TRUE16-LABEL: fshr_v2i32:
-; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_clause 0x2
-; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: fshr_v2i32:
-; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_clause 0x2
-; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: fshr_v2i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_not_b32 s8, s6
+; GFX11-NEXT: s_lshr_b32 s3, s3, s7
+; GFX11-NEXT: s_lshl_b32 s1, s1, 1
+; GFX11-NEXT: s_not_b32 s7, s7
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_lshr_b32 s2, s2, s6
+; GFX11-NEXT: s_lshl_b32 s0, s0, s8
+; GFX11-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_v2i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_not_b32 s8, s6
+; GFX12-NEXT: s_lshr_b32 s3, s3, s7
+; GFX12-NEXT: s_lshl_b32 s1, s1, 1
+; GFX12-NEXT: s_not_b32 s7, s7
+; GFX12-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-NEXT: s_lshr_b32 s2, s2, s6
+; GFX12-NEXT: s_lshl_b32 s0, s0, s8
+; GFX12-NEXT: s_lshl_b32 s1, s1, s7
+; GFX12-NEXT: s_or_b32 s0, s0, s2
+; GFX12-NEXT: s_or_b32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
store <2 x i32> %0, ptr addrspace(1) %in
@@ -373,10 +393,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7
+; SI-NEXT: s_lshr_b32 s3, s3, 9
+; SI-NEXT: s_lshl_b32 s1, s1, 23
+; SI-NEXT: s_lshr_b32 s2, s2, 7
+; SI-NEXT: s_lshl_b32 s0, s0, 25
+; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -385,11 +409,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7
+; VI-NEXT: s_lshr_b32 s3, s3, 9
+; VI-NEXT: s_lshl_b32 s1, s1, 23
+; VI-NEXT: s_lshr_b32 s2, s2, 7
+; VI-NEXT: s_lshl_b32 s0, s0, 25
+; VI-NEXT: s_or_b32 s1, s1, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -400,10 +428,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7
+; GFX9-NEXT: s_lshr_b32 s3, s3, 9
+; GFX9-NEXT: s_lshl_b32 s1, s1, 23
+; GFX9-NEXT: s_lshr_b32 s2, s2, 7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 25
+; GFX9-NEXT: s_or_b32 s1, s1, s3
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -428,8 +460,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7
+; GFX10-NEXT: s_lshr_b32 s3, s3, 9
+; GFX10-NEXT: s_lshr_b32 s2, s2, 7
+; GFX10-NEXT: s_lshl_b32 s0, s0, 25
+; GFX10-NEXT: s_lshl_b32 s1, s1, 23
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -438,10 +476,16 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7
+; GFX11-NEXT: s_lshr_b32 s3, s3, 9
+; GFX11-NEXT: s_lshr_b32 s2, s2, 7
+; GFX11-NEXT: s_lshl_b32 s0, s0, 25
+; GFX11-NEXT: s_lshl_b32 s1, s1, 23
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
@@ -450,10 +494,16 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7
+; GFX12-NEXT: s_lshr_b32 s3, s3, 9
+; GFX12-NEXT: s_lshr_b32 s2, s2, 7
+; GFX12-NEXT: s_lshl_b32 s0, s0, 25
+; GFX12-NEXT: s_lshl_b32 s1, s1, 23
+; GFX12-NEXT: s_or_b32 s0, s0, s2
+; GFX12-NEXT: s_or_b32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_endpgm
entry:
@@ -471,18 +521,30 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: s_lshl_b32 s11, s11, 1
+; SI-NEXT: s_lshr_b32 s15, s15, s3
+; SI-NEXT: s_not_b32 s3, s3
+; SI-NEXT: s_lshl_b32 s3, s11, s3
+; SI-NEXT: s_lshr_b32 s11, s14, s2
+; SI-NEXT: s_lshl_b32 s10, s10, 1
+; SI-NEXT: s_not_b32 s2, s2
+; SI-NEXT: s_lshl_b32 s2, s10, s2
+; SI-NEXT: s_lshr_b32 s10, s13, s1
+; SI-NEXT: s_lshl_b32 s9, s9, 1
+; SI-NEXT: s_not_b32 s1, s1
+; SI-NEXT: s_lshl_b32 s1, s9, s1
+; SI-NEXT: s_lshr_b32 s9, s12, s0
+; SI-NEXT: s_lshl_b32 s8, s8, 1
+; SI-NEXT: s_not_b32 s0, s0
+; SI-NEXT: s_lshl_b32 s0, s8, s0
+; SI-NEXT: s_or_b32 s3, s3, s15
+; SI-NEXT: s_or_b32 s2, s2, s11
+; SI-NEXT: s_or_b32 s1, s1, s10
+; SI-NEXT: s_or_b32 s0, s0, s9
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v4, s0
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -492,19 +554,31 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s15
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4
+; VI-NEXT: s_lshl_b32 s7, s11, 1
+; VI-NEXT: s_lshr_b32 s6, s15, s3
+; VI-NEXT: s_not_b32 s3, s3
+; VI-NEXT: s_lshl_b32 s3, s7, s3
+; VI-NEXT: s_or_b32 s3, s3, s6
+; VI-NEXT: s_lshr_b32 s6, s14, s2
+; VI-NEXT: s_lshl_b32 s7, s10, 1
+; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: s_lshl_b32 s2, s7, s2
+; VI-NEXT: s_or_b32 s2, s2, s6
+; VI-NEXT: s_lshr_b32 s6, s13, s1
+; VI-NEXT: s_lshl_b32 s7, s9, 1
+; VI-NEXT: s_not_b32 s1, s1
+; VI-NEXT: s_lshl_b32 s1, s7, s1
+; VI-NEXT: s_or_b32 s1, s1, s6
+; VI-NEXT: s_lshr_b32 s6, s12, s0
+; VI-NEXT: s_lshl_b32 s7, s8, 1
+; VI-NEXT: s_not_b32 s0, s0
+; VI-NEXT: s_lshl_b32 s0, s7, s0
+; VI-NEXT: s_or_b32 s0, s0, s6
; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -516,18 +590,30 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s14
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
+; GFX9-NEXT: s_lshl_b32 s5, s11, 1
+; GFX9-NEXT: s_lshr_b32 s4, s15, s3
+; GFX9-NEXT: s_not_b32 s3, s3
+; GFX9-NEXT: s_lshl_b32 s3, s5, s3
+; GFX9-NEXT: s_or_b32 s3, s3, s4
+; GFX9-NEXT: s_lshr_b32 s4, s14, s2
+; GFX9-NEXT: s_lshl_b32 s5, s10, 1
+; GFX9-NEXT: s_not_b32 s2, s2
+; GFX9-NEXT: s_lshl_b32 s2, s5, s2
+; GFX9-NEXT: s_or_b32 s2, s2, s4
+; GFX9-NEXT: s_lshr_b32 s4, s13, s1
+; GFX9-NEXT: s_lshl_b32 s5, s9, 1
+; GFX9-NEXT: s_not_b32 s1, s1
+; GFX9-NEXT: s_lshl_b32 s1, s5, s1
+; GFX9-NEXT: s_or_b32 s1, s1, s4
+; GFX9-NEXT: s_lshr_b32 s4, s12, s0
+; GFX9-NEXT: s_lshl_b32 s5, s8, 1
+; GFX9-NEXT: s_not_b32 s0, s0
+; GFX9-NEXT: s_lshl_b32 s0, s5, s0
+; GFX9-NEXT: s_or_b32 s0, s0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_endpgm
;
@@ -552,101 +638,105 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-LABEL: fshr_v4i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: v_mov_b32_e32 v4, s1
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5
-; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX10-NEXT: s_lshl_b32 s5, s11, 1
+; GFX10-NEXT: s_lshr_b32 s4, s15, s3
+; GFX10-NEXT: s_not_b32 s3, s3
+; GFX10-NEXT: s_lshr_b32 s11, s14, s2
+; GFX10-NEXT: s_lshl_b32 s10, s10, 1
+; GFX10-NEXT: s_not_b32 s2, s2
+; GFX10-NEXT: s_lshr_b32 s13, s13, s1
+; GFX10-NEXT: s_lshl_b32 s9, s9, 1
+; GFX10-NEXT: s_not_b32 s1, s1
+; GFX10-NEXT: s_lshr_b32 s12, s12, s0
+; GFX10-NEXT: s_lshl_b32 s8, s8, 1
+; GFX10-NEXT: s_not_b32 s0, s0
+; GFX10-NEXT: s_lshl_b32 s3, s5, s3
+; GFX10-NEXT: s_lshl_b32 s2, s10, s2
+; GFX10-NEXT: s_lshl_b32 s1, s9, s1
+; GFX10-NEXT: s_lshl_b32 s0, s8, s0
+; GFX10-NEXT: s_or_b32 s3, s3, s4
+; GFX10-NEXT: s_or_b32 s2, s2, s11
+; GFX10-NEXT: s_or_b32 s0, s0, s12
+; GFX10-NEXT: s_or_b32 s1, s1, s13
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX10-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: fshr_v4i32:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
-; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l
-; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: fshr_v4i32:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
-; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5
-; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5]
-; GFX11-FAKE16-NEXT: s_endpgm
-;
-; GFX12-TRUE16-LABEL: fshr_v4i32:
-; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_clause 0x2
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
-; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l
-; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: fshr_v4i32:
-; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_clause 0x2
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
-; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5
-; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: fshr_v4i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s7, s11, 1
+; GFX11-NEXT: s_lshr_b32 s6, s15, s3
+; GFX11-NEXT: s_not_b32 s3, s3
+; GFX11-NEXT: s_lshr_b32 s11, s14, s2
+; GFX11-NEXT: s_lshl_b32 s10, s10, 1
+; GFX11-NEXT: s_not_b32 s2, s2
+; GFX11-NEXT: s_lshr_b32 s13, s13, s1
+; GFX11-NEXT: s_lshl_b32 s9, s9, 1
+; GFX11-NEXT: s_not_b32 s1, s1
+; GFX11-NEXT: s_lshr_b32 s12, s12, s0
+; GFX11-NEXT: s_lshl_b32 s8, s8, 1
+; GFX11-NEXT: s_not_b32 s0, s0
+; GFX11-NEXT: s_lshl_b32 s3, s7, s3
+; GFX11-NEXT: s_lshl_b32 s2, s10, s2
+; GFX11-NEXT: s_lshl_b32 s1, s9, s1
+; GFX11-NEXT: s_lshl_b32 s0, s8, s0
+; GFX11-NEXT: s_or_b32 s3, s3, s6
+; GFX11-NEXT: s_or_b32 s2, s2, s11
+; GFX11-NEXT: s_or_b32 s0, s0, s12
+; GFX11-NEXT: s_or_b32 s1, s1, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_v4i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s7, s11, 1
+; GFX12-NEXT: s_lshr_b32 s6, s15, s3
+; GFX12-NEXT: s_not_b32 s3, s3
+; GFX12-NEXT: s_lshr_b32 s11, s14, s2
+; GFX12-NEXT: s_lshl_b32 s10, s10, 1
+; GFX12-NEXT: s_not_b32 s2, s2
+; GFX12-NEXT: s_lshr_b32 s13, s13, s1
+; GFX12-NEXT: s_lshl_b32 s9, s9, 1
+; GFX12-NEXT: s_not_b32 s1, s1
+; GFX12-NEXT: s_lshr_b32 s12, s12, s0
+; GFX12-NEXT: s_lshl_b32 s8, s8, 1
+; GFX12-NEXT: s_not_b32 s0, s0
+; GFX12-NEXT: s_lshl_b32 s3, s7, s3
+; GFX12-NEXT: s_lshl_b32 s2, s10, s2
+; GFX12-NEXT: s_lshl_b32 s1, s9, s1
+; GFX12-NEXT: s_lshl_b32 s0, s8, s0
+; GFX12-NEXT: s_or_b32 s3, s3, s6
+; GFX12-NEXT: s_or_b32 s2, s2, s11
+; GFX12-NEXT: s_or_b32 s0, s0, s12
+; GFX12-NEXT: s_or_b32 s1, s1, s13
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX12-NEXT: s_endpgm
entry:
%0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
store <4 x i32> %0, ptr addrspace(1) %in
@@ -661,14 +751,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_mov_b32_e32 v1, s14
-; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9
-; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
+; SI-NEXT: s_lshr_b32 s4, s15, 1
+; SI-NEXT: s_lshl_b32 s5, s11, 31
+; SI-NEXT: s_or_b32 s4, s5, s4
+; SI-NEXT: s_lshr_b32 s5, s14, 9
+; SI-NEXT: s_lshl_b32 s6, s10, 23
+; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: s_lshr_b32 s6, s13, 7
+; SI-NEXT: s_lshl_b32 s7, s9, 25
+; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_lshr_b32 s7, s12, 1
+; SI-NEXT: s_lshl_b32 s8, s8, 31
+; SI-NEXT: s_or_b32 s7, s8, s7
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s6
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -677,15 +775,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s15
-; VI-NEXT: v_mov_b32_e32 v1, s14
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1
-; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9
-; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_lshr_b32 s2, s15, 1
+; VI-NEXT: s_lshl_b32 s3, s11, 31
+; VI-NEXT: s_lshr_b32 s4, s14, 9
+; VI-NEXT: s_lshl_b32 s5, s10, 23
+; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_or_b32 s3, s5, s4
+; VI-NEXT: s_lshr_b32 s4, s13, 7
+; VI-NEXT: s_lshl_b32 s5, s9, 25
+; VI-NEXT: s_or_b32 s4, s5, s4
+; VI-NEXT: s_lshr_b32 s5, s12, 1
+; VI-NEXT: s_lshl_b32 s6, s8, 31
+; VI-NEXT: s_or_b32 s5, s6, s5
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
@@ -696,14 +802,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s15
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v0, s13
-; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9
-; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
+; GFX9-NEXT: s_lshr_b32 s2, s15, 1
+; GFX9-NEXT: s_lshl_b32 s3, s11, 31
+; GFX9-NEXT: s_lshr_b32 s4, s14, 9
+; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: s_lshl_b32 s3, s10, 23
+; GFX9-NEXT: s_or_b32 s3, s3, s4
+; GFX9-NEXT: s_lshr_b32 s4, s13, 7
+; GFX9-NEXT: s_lshl_b32 s5, s9, 25
+; GFX9-NEXT: s_or_b32 s4, s5, s4
+; GFX9-NEXT: s_lshr_b32 s5, s12, 1
+; GFX9-NEXT: s_lshl_b32 s6, s8, 31
+; GFX9-NEXT: s_or_b32 s5, s6, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -730,10 +844,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1
+; GFX10-NEXT: s_lshr_b32 s2, s15, 1
+; GFX10-NEXT: s_lshl_b32 s3, s11, 31
+; GFX10-NEXT: s_lshr_b32 s4, s14, 9
+; GFX10-NEXT: s_lshl_b32 s5, s10, 23
+; GFX10-NEXT: s_lshr_b32 s6, s13, 7
+; GFX10-NEXT: s_lshl_b32 s7, s9, 25
+; GFX10-NEXT: s_lshr_b32 s9, s12, 1
+; GFX10-NEXT: s_lshl_b32 s8, s8, 31
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_or_b32 s3, s5, s4
+; GFX10-NEXT: s_or_b32 s4, s8, s9
+; GFX10-NEXT: s_or_b32 s5, s7, s6
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -742,12 +868,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1
+; GFX11-NEXT: s_lshr_b32 s2, s15, 1
+; GFX11-NEXT: s_lshl_b32 s3, s11, 31
+; GFX11-NEXT: s_lshr_b32 s4, s14, 9
+; GFX11-NEXT: s_lshl_b32 s5, s10, 23
+; GFX11-NEXT: s_lshr_b32 s6, s13, 7
+; GFX11-NEXT: s_lshl_b32 s7, s9, 25
+; GFX11-NEXT: s_lshr_b32 s9, s12, 1
+; GFX11-NEXT: s_lshl_b32 s8, s8, 31
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_or_b32 s3, s5, s4
+; GFX11-NEXT: s_or_b32 s4, s8, s9
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -756,12 +893,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1
-; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9
-; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7
-; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1
+; GFX12-NEXT: s_lshr_b32 s2, s15, 1
+; GFX12-NEXT: s_lshl_b32 s3, s11, 31
+; GFX12-NEXT: s_lshr_b32 s4, s14, 9
+; GFX12-NEXT: s_lshl_b32 s5, s10, 23
+; GFX12-NEXT: s_lshr_b32 s6, s13, 7
+; GFX12-NEXT: s_lshl_b32 s7, s9, 25
+; GFX12-NEXT: s_lshr_b32 s9, s12, 1
+; GFX12-NEXT: s_lshl_b32 s8, s8, 31
+; GFX12-NEXT: s_or_b32 s2, s3, s2
+; GFX12-NEXT: s_or_b32 s3, s5, s4
+; GFX12-NEXT: s_or_b32 s4, s8, s9
+; GFX12-NEXT: s_or_b32 s5, s7, s6
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1022,9 +1170,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; SI-LABEL: v_fshr_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_and_b32_e32 v1, 15, v2
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i16:
@@ -1121,16 +1271,19 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; SI-LABEL: v_fshr_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v5, 16, v5
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v2, 15, v4
+; SI-NEXT: v_bfe_u32 v0, v0, v2, 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v2, 15, v5
+; SI-NEXT: v_lshrrev_b32_e32 v3, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v4, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_bfe_u32 v1, v1, v2, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
@@ -1219,20 +1372,25 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; SI-LABEL: v_fshr_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v7, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
-; SI-NEXT: v_or_b32_e32 v4, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
-; SI-NEXT: v_or_b32_e32 v3, 16, v8
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_and_b32_e32 v3, 15, v6
+; SI-NEXT: v_bfe_u32 v0, v0, v3, 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_and_b32_e32 v3, 15, v7
+; SI-NEXT: v_lshrrev_b32_e32 v1, v3, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v3, 15, v8
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, v3, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; SI-NEXT: v_bfe_u32 v2, v2, v3, 16
+; SI-NEXT: v_alignbit_b32 v1, v4, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
@@ -1422,26 +1580,32 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI-LABEL: v_fshr_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v9, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
-; SI-NEXT: v_or_b32_e32 v5, 16, v8
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
-; SI-NEXT: v_or_b32_e32 v4, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
-; SI-NEXT: v_or_b32_e32 v5, 16, v10
-; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_and_b32_e32 v4, 15, v8
+; SI-NEXT: v_bfe_u32 v0, v0, v4, 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v1, v1, v4
+; SI-NEXT: v_and_b32_e32 v4, 15, v9
+; SI-NEXT: v_lshrrev_b32_e32 v1, v4, v1
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_and_b32_e32 v4, 15, v10
+; SI-NEXT: v_bfe_u32 v2, v2, v4, 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v3, v4
+; SI-NEXT: v_and_b32_e32 v4, 15, v11
+; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_bfe_u32 v3, v3, v4, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
@@ -1882,9 +2046,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; SI-LABEL: v_fshr_i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; SI-NEXT: v_mul_hi_u32 v3, v3, s4
+; SI-NEXT: v_mul_hi_u32 v3, v2, s4
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -1895,9 +2059,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; VI-LABEL: v_fshr_i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
-; VI-NEXT: v_mul_hi_u32 v3, v3, s4
+; VI-NEXT: v_mul_hi_u32 v3, v2, s4
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
@@ -1908,9 +2072,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; GFX9-LABEL: v_fshr_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
-; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4
+; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
@@ -1926,9 +2090,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; GFX10-LABEL: v_fshr_i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
+; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2
; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
@@ -1938,25 +2102,26 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; GFX11-TRUE16-LABEL: v_fshr_i24:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
-; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2
+; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v3, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fshr_i24:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
+; GFX11-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2
; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3
@@ -1972,16 +2137,17 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
-; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2
+; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v1
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v3, v1.l
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_fshr_i24:
@@ -1991,10 +2157,10 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
+; GFX12-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2
; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3
@@ -2096,9 +2262,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -2107,12 +2271,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v6, v2.l
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v7, v3.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fshr_v2i24:
@@ -2148,9 +2317,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -2159,12 +2326,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v6, v2.l
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v7, v3.l
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_fshr_v2i24:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index f8ff8efbb1ef1..0a18a393cb53a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -4773,9 +4773,10 @@ define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -4807,9 +4808,10 @@ define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v2
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: s_mov_b32 s7, 0xf000
@@ -4847,13 +4849,15 @@ define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; CI-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_or_b32_e32 v1, v0, v1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
@@ -4885,21 +4889,25 @@ define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5
; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v6, v6, v7
+; CI-NEXT: v_or_b32_e32 v5, v4, v5
+; CI-NEXT: v_or_b32_e32 v4, v2, v3
+; CI-NEXT: v_or_b32_e32 v3, v0, v1
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
@@ -4931,39 +4939,47 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_or_b32_e32 v5, v4, v5
+; CI-NEXT: v_or_b32_e32 v4, v2, v3
+; CI-NEXT: v_or_b32_e32 v3, v0, v1
+; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; CI-NEXT: v_alignbit_b32 v14, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v14, v1, v0
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v13
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v12
-; CI-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v13, v1, v0
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; CI-NEXT: v_alignbit_b32 v12, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_or_b32_e32 v12, v1, v0
; CI-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7
; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; CI-NEXT: v_alignbit_b32 v11, v0, v1, 16
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; CI-NEXT: v_or_b32_e32 v11, v1, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; CI-NEXT: v_or_b32_e32 v6, v6, v7
; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0
; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 1f74fbdc46e98..86ec634e029ff 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -20632,30 +20632,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -20686,16 +20686,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -20731,17 +20732,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -21013,30 +21015,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21066,16 +21068,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -21111,17 +21114,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -21394,30 +21398,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21451,16 +21455,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -21498,17 +21503,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v1
; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -21770,29 +21776,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -21823,16 +21829,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -21866,17 +21873,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -22139,29 +22147,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -22192,16 +22200,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -22235,17 +22244,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -22509,29 +22519,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -22566,16 +22576,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -22613,17 +22624,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -22897,30 +22909,30 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -22950,16 +22962,17 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -22995,17 +23008,18 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -23274,29 +23288,29 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -23327,16 +23341,17 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -23370,17 +23385,18 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -23649,30 +23665,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -23703,16 +23719,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -23748,17 +23765,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24021,29 +24039,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -24074,16 +24092,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24117,17 +24136,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24395,30 +24415,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -24449,16 +24469,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24494,17 +24515,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24767,29 +24789,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -24820,16 +24842,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -24863,17 +24886,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -25141,30 +25165,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -25195,16 +25219,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -25240,17 +25265,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -25513,29 +25539,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -25566,16 +25592,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -25609,17 +25636,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_add_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index faa74fef2be2f..32d0d679d85cc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -16338,30 +16338,30 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v3, v3, v4
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16382,26 +16382,27 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -16427,27 +16428,28 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -16844,30 +16846,30 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16887,26 +16889,27 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -16932,27 +16935,28 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -17350,30 +17354,30 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17394,8 +17398,8 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
@@ -17407,16 +17411,17 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -17441,8 +17446,8 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
@@ -17454,17 +17459,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v1
; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -17847,29 +17853,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17889,36 +17895,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17932,37 +17939,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB57_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18337,29 +18345,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18379,36 +18387,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18422,37 +18431,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB58_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18828,29 +18838,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18870,40 +18880,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18917,41 +18928,42 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB59_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19342,30 +19354,30 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: v_max_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19385,26 +19397,27 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19430,27 +19443,28 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19841,29 +19855,29 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: v_max_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19883,36 +19897,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19926,37 +19941,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_max_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_max_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_max_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB61_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index a46b0129b79e6..4065a833a89f5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -16338,30 +16338,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v3, v3, v4
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16382,26 +16382,27 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -16427,27 +16428,28 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -16844,30 +16846,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16887,26 +16889,27 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -16932,27 +16935,28 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -17350,30 +17354,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17394,8 +17398,8 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
@@ -17407,16 +17411,17 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -17441,8 +17446,8 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
@@ -17454,17 +17459,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v1
; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -17847,29 +17853,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17889,36 +17895,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -17932,37 +17939,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB57_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18337,29 +18345,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18379,36 +18387,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18422,37 +18431,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB58_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18828,29 +18838,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18870,40 +18880,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -18917,41 +18928,42 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB59_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19342,30 +19354,30 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
; GFX8-NEXT: v_min_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19385,26 +19397,27 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19430,27 +19443,28 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19841,29 +19855,29 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: v_min_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19883,36 +19897,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -19926,37 +19941,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_min_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_min_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX6-NEXT: v_min_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB61_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 053efdcb76261..8da0f9e68c718 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -16699,30 +16699,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -16753,16 +16753,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -16798,17 +16799,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -17205,30 +17207,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17258,16 +17260,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -17303,17 +17306,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -17711,30 +17715,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -17768,16 +17772,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -17815,17 +17820,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v1
; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
@@ -18208,29 +18214,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18261,16 +18267,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -18304,17 +18311,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -18698,29 +18706,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -18751,16 +18759,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -18794,17 +18803,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19189,29 +19199,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19246,16 +19256,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -19293,17 +19304,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
@@ -19703,30 +19715,30 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -19756,16 +19768,17 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -19801,17 +19814,18 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -20202,29 +20216,29 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -20255,16 +20269,17 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
@@ -20298,17 +20313,18 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 7ebd69204d87f..ab2618863da2a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2253,40 +2253,36 @@ entry:
define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-LABEL: udot4_acc16_vecMul:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, s3
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
+; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8
-; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8
+; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0
-; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0
+; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot4_acc16_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index c947d6976a95f..86bed9c73c1d3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -238,11 +238,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
+; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -256,11 +257,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s0, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
+; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s0, s1, s0
+; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -312,13 +314,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; VI-NEXT: s_or_b32 s1, s0, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
@@ -334,13 +336,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s1, s0, s1
+; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
@@ -405,13 +407,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; VI-NEXT: s_or_b32 s2, s0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
@@ -430,19 +433,20 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s1, s0, s1
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
@@ -825,10 +829,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v2, s4, 16
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_or_b32_e32 v2, s0, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index e1f84dcbaa607..234c8f229ff34 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -40,9 +40,10 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
; GFX7-LABEL: buffer_store_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX7-NEXT: s_endpgm
;
@@ -73,13 +74,15 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
; GFX7-LABEL: buffer_store_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v0, v1
; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
; GFX7-NEXT: s_endpgm
;
@@ -116,21 +119,25 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf
; GFX7-LABEL: buffer_store_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v0, v1
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
index de1f859132e61..c71e7cbddaaf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -421,9 +421,10 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
; VERDE-LABEL: buffer_store_v2bf16:
; VERDE: ; %bb.0:
; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VERDE-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VERDE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VERDE-NEXT: v_or_b32_e32 v0, v0, v1
; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; VERDE-NEXT: s_endpgm
;
@@ -439,13 +440,15 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
; VERDE-LABEL: buffer_store_v4bf16:
; VERDE: ; %bb.0:
; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1
; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; VERDE-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VERDE-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VERDE-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VERDE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VERDE-NEXT: v_or_b32_e32 v2, v2, v3
+; VERDE-NEXT: v_or_b32_e32 v1, v0, v1
; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
; VERDE-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b1bdfa667f57e..6e564e39b8cd4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9771,19 +9771,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX6-NOHSA: ; %bb.0:
; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s2, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s2, 0x80010
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s4, s4, 8
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s5
+; GFX6-NOHSA-NEXT: s_or_b32 s4, s2, s4
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -9799,14 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 8
+; GFX7-HSA-NEXT: s_bfe_u32 s3, s2, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000
; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16
+; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1
; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1
; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-HSA-NEXT: s_endpgm
;
@@ -9819,14 +9821,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s2, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000
; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s3, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10001,26 +10004,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s5, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s4, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s4, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
+; GFX6-NOHSA-NEXT: s_and_b32 s9, s9, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s8, s7
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s6
+; GFX6-NOHSA-NEXT: s_or_b32 s6, s10, s9
+; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s2
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -10032,27 +10037,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-HSA-NEXT: s_and_b32 s1, s3, 0xff00
+; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 8
+; GFX7-HSA-NEXT: s_bfe_u32 s5, s3, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s1, 8
+; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff0000
+; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 8
+; GFX7-HSA-NEXT: s_or_b32 s4, s5, s4
+; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s5, s2, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4
-; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
+; GFX7-HSA-NEXT: s_or_b32 s3, s5, s3
+; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s4
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
;
@@ -10064,25 +10071,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s3, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s5, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s4, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10322,42 +10330,47 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s9, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s10, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s11, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s5, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s4, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s7, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s17, s7, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s19, s6, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s11, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10
+; GFX6-NOHSA-NEXT: s_or_b32 s12, s13, s12
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s11
+; GFX6-NOHSA-NEXT: s_or_b32 s11, s15, s14
+; GFX6-NOHSA-NEXT: s_or_b32 s13, s17, s16
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s9
+; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s18
; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s10
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -10365,53 +10378,57 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24
-; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s5, 8
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX7-HSA-NEXT: s_lshl_b32 s9, s9, 8
+; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s10, s10, 0xff0000
+; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9
+; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 8
+; GFX7-HSA-NEXT: s_and_b32 s3, s7, 0xff00
+; GFX7-HSA-NEXT: s_or_b32 s10, s11, s10
+; GFX7-HSA-NEXT: s_and_b32 s9, s9, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff
+; GFX7-HSA-NEXT: s_or_b32 s9, s11, s9
+; GFX7-HSA-NEXT: s_or_b32 s4, s4, s8
+; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 8
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s7, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s3, s3, 8
+; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff0000
+; GFX7-HSA-NEXT: s_or_b32 s3, s7, s3
+; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8
+; GFX7-HSA-NEXT: s_or_b32 s8, s11, s8
+; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80010
+; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff
; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12
-; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10
-; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8
-; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_or_b32 s7, s11, s7
+; GFX7-HSA-NEXT: s_or_b32 s2, s6, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -10422,50 +10439,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s5, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s5, 0x80010
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s3, s2
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s4, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s4, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s4, s2, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
+; GFX8-NOHSA-NEXT: s_and_b32 s8, s5, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
+; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s9, s9, 16
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s3, s7
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s6, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s8, s5
+; GFX8-NOHSA-NEXT: s_or_b32 s8, s10, s9
+; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s6, 0x80010
; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_and_b32 s9, s6, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s6, s9, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -10850,80 +10869,91 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NOHSA-NEXT: s_and_b32 s13, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s15, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s2, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s17, s3, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s0, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s19, s1, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s1, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s0, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s0, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s2, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5
+; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4
+; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s4, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
+; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6
+; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s6, 8
+; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s6, 0x80010
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s20, s20, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s19, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s22, s22, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s24, s24, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s26, s26, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s28, s28, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s15, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s30, s30, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16
+; GFX6-NOHSA-NEXT: s_and_b32 s33, s33, 0xff0000
+; GFX6-NOHSA-NEXT: s_lshl_b32 s13, s13, 8
+; GFX6-NOHSA-NEXT: s_and_b32 s35, s35, 0xff0000
; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8
-; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14
+; GFX6-NOHSA-NEXT: s_or_b32 s20, s21, s20
+; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s19
+; GFX6-NOHSA-NEXT: s_or_b32 s19, s23, s22
+; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s18
+; GFX6-NOHSA-NEXT: s_or_b32 s18, s25, s24
+; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s17
+; GFX6-NOHSA-NEXT: s_or_b32 s17, s27, s26
+; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s16
+; GFX6-NOHSA-NEXT: s_or_b32 s16, s29, s28
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s15
+; GFX6-NOHSA-NEXT: s_or_b32 s15, s31, s30
+; GFX6-NOHSA-NEXT: s_or_b32 s21, s34, s33
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s13
+; GFX6-NOHSA-NEXT: s_or_b32 s13, s36, s35
; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s14
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NOHSA-NEXT: s_endpgm
;
@@ -10936,94 +10966,102 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s16, s1, 0xff00
+; GFX7-HSA-NEXT: s_lshr_b32 s17, s1, 8
+; GFX7-HSA-NEXT: s_bfe_u32 s18, s1, 0x80010
; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24
-; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24
+; GFX7-HSA-NEXT: s_lshl_b32 s16, s16, 8
+; GFX7-HSA-NEXT: s_and_b32 s17, s17, 0xff0000
+; GFX7-HSA-NEXT: s_or_b32 s16, s1, s16
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s0, 8
+; GFX7-HSA-NEXT: s_and_b32 s15, s0, 0xff00
+; GFX7-HSA-NEXT: s_or_b32 s17, s18, s17
+; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s18, s0, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s18, s18, s1
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8
+; GFX7-HSA-NEXT: s_or_b32 s15, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s0, s3, 8
+; GFX7-HSA-NEXT: s_and_b32 s14, s3, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s19, s1, s0
; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24
-; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 8
+; GFX7-HSA-NEXT: s_and_b32 s13, s2, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s1, s2, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s14, s1, s0
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s13, 8
; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s0, s5, 8
+; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s1, s5, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s13, s1, s0
; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
-; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0
+; GFX7-HSA-NEXT: s_lshr_b32 s0, s4, 8
+; GFX7-HSA-NEXT: s_and_b32 s11, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s1, s4, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s12, s1, s0
; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s11, 8
; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0
-; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8
-; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
+; GFX7-HSA-NEXT: s_lshr_b32 s0, s7, 8
+; GFX7-HSA-NEXT: s_and_b32 s10, s7, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s1, s7, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
+; GFX7-HSA-NEXT: s_and_b32 s1, s7, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s7, s10, 8
+; GFX7-HSA-NEXT: s_or_b32 s1, s1, s7
+; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8
+; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX7-HSA-NEXT: s_bfe_u32 s10, s6, 0x80010
+; GFX7-HSA-NEXT: s_or_b32 s7, s10, s7
+; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
+; GFX7-HSA-NEXT: s_or_b32 s6, s6, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s18
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX7-HSA-NEXT: s_endpgm
@@ -11034,51 +11072,53 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s1, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s16, s1, 0xff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 16
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
-; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s10, s11, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s1
; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT: s_lshl_b32 s13, s13, 16
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s14, s13
; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s14, s1, s0
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s15, s1, s0
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s4, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s4, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
@@ -11086,46 +11126,48 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s6, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s18, s6, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s7, s18, s7
+; GFX8-NOHSA-NEXT: s_and_b32 s18, s6, 0xff
; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s6, s18, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 5bc02c4d63181..b4029d57523ba 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -9830,12 +9830,13 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v0, 16, 8
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
+; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v0, v4
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -9853,12 +9854,13 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GCN-HSA-NEXT: v_bfe_u32 v5, v2, 16, 8
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff0000, v4
+; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v3
+; GCN-HSA-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-HSA-NEXT: v_or_b32_e32 v2, v2, v6
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -9878,9 +9880,9 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
+; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -10063,33 +10065,41 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s0, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s1, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s1, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s9, s1, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s0, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s11, s0, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s7, s7, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s9, s8
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s1, s7
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s11, s10
+; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s0, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
@@ -10105,20 +10115,28 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-HSA-NEXT: s_and_b32 s2, s0, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s3, s1, 0xff00
+; GCN-HSA-NEXT: s_lshr_b32 s4, s1, 8
+; GCN-HSA-NEXT: s_lshr_b32 s6, s0, 8
+; GCN-HSA-NEXT: s_bfe_u32 s5, s1, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s7, s0, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s3, s3, 8
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s2, s2, 8
+; GCN-HSA-NEXT: s_or_b32 s4, s5, s4
+; GCN-HSA-NEXT: s_or_b32 s1, s1, s3
+; GCN-HSA-NEXT: s_or_b32 s3, s7, s6
+; GCN-HSA-NEXT: s_or_b32 s0, s0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -10136,23 +10154,28 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -10412,35 +10435,52 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s5, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s7, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s7, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s13, s7, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s15, s6, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s5, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s17, s5, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s19, s4, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s11, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s10, s10, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-SI-NEXT: s_or_b32 s12, s13, s12
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s11
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s15, s14
+; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s17, s16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s19, s18
+; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s8
+; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -10454,43 +10494,59 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s7, s3, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GCN-HSA-NEXT: s_lshr_b32 s10, s5, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8
+; GCN-HSA-NEXT: s_lshr_b32 s14, s3, 8
+; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8
+; GCN-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s13, s4, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s15, s3, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s17, s2, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 8
+; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-HSA-NEXT: s_or_b32 s10, s11, s10
+; GCN-HSA-NEXT: s_or_b32 s5, s5, s9
+; GCN-HSA-NEXT: s_or_b32 s9, s13, s12
+; GCN-HSA-NEXT: s_or_b32 s4, s4, s8
+; GCN-HSA-NEXT: s_or_b32 s8, s15, s14
+; GCN-HSA-NEXT: s_or_b32 s3, s3, s7
+; GCN-HSA-NEXT: s_or_b32 s7, s17, s16
+; GCN-HSA-NEXT: s_or_b32 s2, s2, s6
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s10
+; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10507,42 +10563,52 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
+; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v16i8_to_v16i16:
@@ -10928,71 +10994,105 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s5, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s7, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s8, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s9, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s10, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s11, 0xff00
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s11, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s21, s11, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s23, s10, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s25, s9, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s27, s8, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s29, s7, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s31, s6, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s5, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s34, s5, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s4, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_u32 s36, s4, 0x80010
+; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s19, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s18, s18, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s17, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s16, s16, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s33, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s13, 8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s35, 0xff0000
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s12, s12, 8
+; GCN-NOHSA-SI-NEXT: s_or_b32 s20, s21, s20
+; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s11, s19
+; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s23, s22
+; GCN-NOHSA-SI-NEXT: s_or_b32 s10, s10, s18
+; GCN-NOHSA-SI-NEXT: s_or_b32 s18, s25, s24
+; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s9, s17
+; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s27, s26
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s8, s16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s16, s29, s28
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s15
+; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s31, s30
+; GCN-NOHSA-SI-NEXT: s_or_b32 s21, s34, s33
+; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s13
+; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s36, s35
+; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s12
+; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -11005,88 +11105,120 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s10, s6, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s11, s7, 0xff00
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 8
+; GCN-HSA-NEXT: s_bfe_u32 s3, s7, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 8
+; GCN-HSA-NEXT: s_bfe_u32 s13, s6, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s10, s10, 8
+; GCN-HSA-NEXT: s_lshr_b32 s14, s5, 8
+; GCN-HSA-NEXT: s_bfe_u32 s15, s5, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8
+; GCN-HSA-NEXT: s_bfe_u32 s17, s4, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff0000
+; GCN-HSA-NEXT: s_or_b32 s7, s7, s11
+; GCN-HSA-NEXT: s_and_b32 s11, s12, 0xff0000
+; GCN-HSA-NEXT: s_or_b32 s6, s6, s10
+; GCN-HSA-NEXT: s_and_b32 s10, s14, 0xff0000
+; GCN-HSA-NEXT: s_or_b32 s5, s5, s9
+; GCN-HSA-NEXT: s_and_b32 s9, s16, 0xff0000
+; GCN-HSA-NEXT: s_or_b32 s4, s4, s8
+; GCN-HSA-NEXT: s_or_b32 s2, s3, s2
+; GCN-HSA-NEXT: s_or_b32 s3, s13, s11
+; GCN-HSA-NEXT: s_or_b32 s8, s15, s10
+; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s7, s3, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GCN-HSA-NEXT: s_lshr_b32 s10, s5, 8
+; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8
+; GCN-HSA-NEXT: s_lshr_b32 s14, s3, 8
+; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8
+; GCN-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s13, s4, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s15, s3, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xff
+; GCN-HSA-NEXT: s_bfe_u32 s17, s2, 0x80010
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 8
+; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-HSA-NEXT: s_or_b32 s10, s11, s10
+; GCN-HSA-NEXT: s_or_b32 s5, s5, s9
+; GCN-HSA-NEXT: s_or_b32 s9, s13, s12
+; GCN-HSA-NEXT: s_or_b32 s4, s4, s8
+; GCN-HSA-NEXT: s_or_b32 s8, s15, s14
+; GCN-HSA-NEXT: s_or_b32 s3, s3, s7
+; GCN-HSA-NEXT: s_or_b32 s7, s17, s16
+; GCN-HSA-NEXT: s_or_b32 s6, s2, s6
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9]
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13
-; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9
-; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12
-; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16:
@@ -11104,79 +11236,100 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s11, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s13, s11, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s11, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s9, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s31, s5, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s5, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s35, s4, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s4, 0xff
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s10, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s10, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s10, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s19, s9, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s9, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s8, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s8, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s8, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s7, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s25, s7, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s7, 0xff
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s28, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s18, s18, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s30, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s34, s34, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s21, s21, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s27, s27, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
-; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
-; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
-; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
-; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
-; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
-; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s14, s11
+; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s19, s18
+; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s31, s30
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s33, s5
+; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s34
+; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s36, s4
+; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12
+; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s16, s15
+; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s10
+; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s20, s9
+; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s21
+; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s23, s8
+; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s25, s24
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s26, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s28, s27
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s29, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v32i8_to_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index a42c71c4849bd..715a22dbf6653 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7054,30 +7054,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v2, v2, v3
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7105,16 +7105,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7144,16 +7145,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_add_f32_e32 v5, v5, v4
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7411,30 +7413,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_add_f32_e32 v2, v2, v3
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7462,16 +7464,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7502,16 +7505,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0
@@ -7761,29 +7765,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -7810,16 +7814,17 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -7847,16 +7852,17 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8105,29 +8111,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_add_f32_e32 v4, v4, v2
; GFX8-NEXT: v_add_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8154,16 +8160,17 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_add_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8192,16 +8199,17 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX6-NEXT: v_add_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 8351d28057564..b3ed754112daa 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -6987,30 +6987,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_max_f32_e32 v2, v2, v3
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7028,26 +7028,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX7-NEXT: v_max_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7067,26 +7068,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: ds_read_b32 v3, v0
; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX6-NEXT: v_max_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7469,30 +7471,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_max_f32_e32 v2, v2, v3
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7510,26 +7512,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX7-NEXT: v_max_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7550,26 +7553,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: ds_read_b32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
-; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX6-NEXT: v_max_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0
@@ -7938,29 +7942,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -7976,33 +7980,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v3, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: ds_read_b32 v4, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX7-NEXT: v_max_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8013,33 +8018,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v3, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: ds_read_b32 v4, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX6-NEXT: v_max_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8401,29 +8407,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_max_f32_e32 v4, v4, v2
; GFX8-NEXT: v_max_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8439,33 +8445,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX7-NEXT: v_max_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8477,33 +8484,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v3, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: ds_read_b32 v4, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX6-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX6-NEXT: v_max_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 0c4aca88b3781..a48b8f5c66b2e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -6987,30 +6987,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_min_f32_e32 v2, v2, v3
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7028,26 +7028,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX7-NEXT: v_min_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7067,26 +7068,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: ds_read_b32 v3, v0
; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX6-NEXT: v_min_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7469,30 +7471,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_min_f32_e32 v2, v2, v3
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7510,26 +7512,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX7-NEXT: v_min_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7550,26 +7553,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: ds_read_b32 v0, v4
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v3
-; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX6-NEXT: v_min_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0
@@ -7938,29 +7942,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -7976,33 +7980,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v3, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: ds_read_b32 v4, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX7-NEXT: v_min_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8013,33 +8018,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v3, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: ds_read_b32 v4, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX6-NEXT: v_min_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8401,29 +8407,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_min_f32_e32 v4, v4, v2
; GFX8-NEXT: v_min_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8439,33 +8445,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
-; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX7-NEXT: v_min_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8477,33 +8484,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v3, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: ds_read_b32 v4, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[4:5], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_min_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GFX6-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX6-NEXT: v_min_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 37310b614c0db..8c1303d98f802 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -7759,30 +7759,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -7810,16 +7810,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_sub_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -7849,16 +7850,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_sub_f32_e32 v5, v5, v4
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_sub_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -8241,30 +8243,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
@@ -8292,16 +8294,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_sub_f32_e32 v6, v1, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
@@ -8332,16 +8335,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_sub_f32_e32 v5, v5, v3
-; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16
+; GFX6-NEXT: v_sub_f32_e32 v6, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0
@@ -8710,29 +8714,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8759,16 +8763,17 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_sub_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -8796,16 +8801,17 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX6-NEXT: v_sub_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -9173,29 +9179,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2
; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -9222,16 +9228,17 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX7-NEXT: v_sub_f32_e32 v6, v3, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -9260,16 +9267,17 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GFX6-NEXT: v_sub_f32_e32 v6, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index aab035f811434..645641d009a45 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
@@ -32,3 +33,5 @@ entry:
store i32 %xyzw, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
index cac983a3acfb3..ee43d46a61917 100644
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -118,14 +118,14 @@ define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %a
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007
+; GCN-NEXT: v_mov_b32_e32 v3, 0x6050403
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_perm_b32 v2, s2, v2, v3
+; GCN-NEXT: v_perm_b32 v2, v2, s2, v3
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a4ddfee115fa6..4d98c8bb54902 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -8,7 +8,10 @@ define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v0
+; GFX10-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -16,9 +19,13 @@ define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s4, 0x6060706
+; GFX9-NEXT: v_mov_b32_e32 v1, 8
+; GFX9-NEXT: s_movk_i32 s4, 0xff
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -327,7 +334,9 @@ define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v0
+; GFX10-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -335,9 +344,10 @@ define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x4070706
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v0
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -405,7 +415,8 @@ define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -413,9 +424,10 @@ define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s4, 0xffff0000
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -457,7 +469,8 @@ define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -465,9 +478,10 @@ define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s4, 0xffff0000
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1242,10 +1256,10 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
-; GFX10-NEXT: global_store_dword v[5:6], v0, off
-; GFX10-NEXT: global_store_dword v[7:8], v1, off
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: global_store_dword v[5:6], v1, off
+; GFX10-NEXT: global_store_dword v[7:8], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: ive_store_div:
@@ -1261,7 +1275,6 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: global_load_dword v10, v[2:3], off
; GFX9-NEXT: s_movk_i32 s4, 0xff
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; GFX9-NEXT: s_mov_b32 s5, 0x2000706
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1270,9 +1283,9 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5
+; GFX9-NEXT: v_or_b32_sdwa v2, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v3, off
+; GFX9-NEXT: global_store_dword v[7:8], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1507,64 +1520,68 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12
-; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14
-; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
-; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15
-; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3
-; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
-; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_trunc_f32_e32 v15, v15
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v11, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v2
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v11
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v20, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13
+; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v15
+; GFX10-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1
+; GFX10-NEXT: v_mul_f32_e32 v16, v3, v16
+; GFX10-NEXT: v_mul_f32_e32 v17, v20, v17
+; GFX10-NEXT: v_ashrrev_i32_e32 v10, 30, v10
+; GFX10-NEXT: v_mul_f32_e32 v18, v3, v18
+; GFX10-NEXT: v_or_b32_e32 v1, 1, v1
; GFX10-NEXT: v_trunc_f32_e32 v16, v16
-; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18
; GFX10-NEXT: v_trunc_f32_e32 v17, v17
-; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
-; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2
-; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19
-; GFX10-NEXT: v_or_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_f32_e32 v19, v2, v19
; GFX10-NEXT: v_trunc_f32_e32 v18, v18
-; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
-; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13
-; GFX10-NEXT: v_or_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1
-; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
-; GFX10-NEXT: v_or_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12
+; GFX10-NEXT: v_mad_f32 v21, -v16, v2, v3
+; GFX10-NEXT: v_mad_f32 v20, -v17, v11, v20
+; GFX10-NEXT: v_or_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_trunc_f32_e32 v19, v19
+; GFX10-NEXT: v_mad_f32 v3, -v18, v13, v3
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v2|
+; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14
+; GFX10-NEXT: v_or_b32_e32 v12, 1, v12
+; GFX10-NEXT: v_mad_f32 v22, -v19, v15, v2
; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v11|
+; GFX10-NEXT: v_or_b32_e32 v14, 1, v14
; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0
-; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
-; GFX10-NEXT: global_store_dword v[5:6], v0, off
-; GFX10-NEXT: global_store_dword v[7:8], v1, off
+; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v3|, |v13|
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v16, v1
+; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v4, 8, v4
+; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v15|
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v18, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_u32_sdwa v10, v19, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: global_store_dword v[5:6], v1, off
+; GFX10-NEXT: global_store_dword v[7:8], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sdiv_store_div:
@@ -1576,64 +1593,67 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: global_load_dword v9, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x60706
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v9, v[2:3], off
+; GFX9-NEXT: s_movk_i32 s4, 0xff
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4
-; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4
-; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15
-; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16
-; GFX9-NEXT: v_trunc_f32_e32 v15, v15
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
-; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17
-; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9
+; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NEXT: v_xor_b32_sdwa v11, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v15, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v9, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v11
+; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v14
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v16
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v4, 1, v11
+; GFX9-NEXT: v_or_b32_e32 v11, 1, v14
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v3
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v13
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v15
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v9
+; GFX9-NEXT: v_mul_f32_e32 v14, v10, v14
+; GFX9-NEXT: v_mul_f32_e32 v16, v12, v16
+; GFX9-NEXT: v_trunc_f32_e32 v14, v14
+; GFX9-NEXT: v_mul_f32_e32 v17, v10, v17
+; GFX9-NEXT: v_mul_f32_e32 v18, v3, v18
; GFX9-NEXT: v_trunc_f32_e32 v16, v16
-; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3
-; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10
+; GFX9-NEXT: v_mad_f32 v19, -v14, v3, v10
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
; GFX9-NEXT: v_trunc_f32_e32 v17, v17
; GFX9-NEXT: v_trunc_f32_e32 v18, v18
-; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2|
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9
-; GFX9-NEXT: v_or_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15
+; GFX9-NEXT: v_mad_f32 v12, -v16, v13, v12
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v3|
+; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14
; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16
-; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3
+; GFX9-NEXT: v_mad_f32 v10, -v17, v15, v10
; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17
-; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2
+; GFX9-NEXT: v_mad_f32 v3, -v18, v9, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12|
-; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14
-; GFX9-NEXT: v_or_b32_e32 v9, 1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13|
-; GFX9-NEXT: v_or_b32_e32 v14, 1, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4|
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc
-; GFX9-NEXT: v_add_u32_e32 v1, v15, v1
-; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_add_u32_e32 v3, v17, v3
-; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v12|, |v13|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v10|, |v15|
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v11, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v14, v1
+; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
+; GFX9-NEXT: v_add_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v1, off
; GFX9-NEXT: global_store_dword v[7:8], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
index 53a49c9a21e2c..25346d8923a83 100644
--- a/llvm/test/CodeGen/AMDGPU/rotate-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll
@@ -44,15 +44,19 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
; SI-LABEL: test_rotl_var:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, v1, v0
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_rotl_var:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, v1, v0
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i32 %x, %y
%sub = sub i32 32, %y
@@ -65,13 +69,19 @@ define i32 @test_rotr_var(i32 %x, i32 %y) {
; SI-LABEL: test_rotr_var:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v0, v0, v0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, v1, v0
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_rotr_var:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, v1, v0
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%shr = lshr i32 %x, %y
%sub = sub i32 32, %y
@@ -164,13 +174,21 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
; SI-LABEL: test_fshr_special_case:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_xor_b32_e32 v2, 31, v2
+; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_fshr_special_case:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: v_xor_b32_e32 v2, 31, v2
+; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%shl = lshr i32 %x1, %y
%srli = shl i32 %x0, 1
@@ -259,11 +277,13 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) {
; SI-LABEL: test_rotl_mul_with_mask_special_case:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 7, v0
; SI-NEXT: v_mul_lo_u32 v1, v1, 9
-; SI-NEXT: v_mul_hi_u32 v2, v0, 9
-; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 25
-; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_mul_hi_u32 v0, v0, 9
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT: v_and_b32_e32 v1, 0x80, v2
+; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -272,9 +292,11 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_lo_u32 v1, v1, 9
; VI-NEXT: v_mul_hi_u32 v2, v0, 9
+; VI-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x80, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
-; VI-NEXT: v_alignbit_b32 v0, v0, v1, 25
-; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 25, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_setpc_b64 s[30:31]
%lhs_mul = mul i64 %i, 1152
@@ -289,16 +311,16 @@ define i32 @test_fshl_with_mask_special_case(i32 %x) {
; SI-LABEL: test_fshl_with_mask_special_case:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_or_b32_e32 v1, 1, v0
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 27
+; SI-NEXT: v_alignbit_b32 v0, v0, v0, 27
+; SI-NEXT: v_or_b32_e32 v0, 32, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_fshl_with_mask_special_case:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_e32 v1, 1, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 27
+; VI-NEXT: v_alignbit_b32 v0, v0, v0, 27
+; VI-NEXT: v_or_b32_e32 v0, 32, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%or1 = or i32 %x, 1
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0a3f572..008d8cef23797 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -25,12 +25,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshl_b32 s4, s2, s3
; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_lshr_b32 s2, s2, s3
+; SI-NEXT: s_or_b32 s2, s4, s2
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -38,11 +40,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b32 s4, s2, s3
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT: s_lshr_b32 s2, s2, s3
+; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -51,19 +55,24 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s3, 32, s3
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT: s_sub_i32 s4, 32, s3
+; GFX10-NEXT: s_lshl_b32 s3, s2, s3
+; GFX10-NEXT: s_lshr_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s3, 32, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT: s_sub_i32 s4, 32, s3
+; GFX11-NEXT: s_lshl_b32 s3, s2, s3
+; GFX11-NEXT: s_lshr_b32 s2, s2, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -97,14 +106,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshl_b32 s6, s0, s2
+; SI-NEXT: s_lshl_b32 s8, s1, s3
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_sub_i32 s2, 32, s2
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_lshr_b32 s0, s0, s2
+; SI-NEXT: s_lshr_b32 s1, s1, s3
+; SI-NEXT: s_or_b32 s1, s8, s1
+; SI-NEXT: s_or_b32 s0, s6, s0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -113,13 +126,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 32, s2
+; GFX8-NEXT: s_lshl_b32 s6, s0, s2
+; GFX8-NEXT: s_lshl_b32 s7, s1, s3
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: s_sub_i32 s2, 32, s2
+; GFX8-NEXT: s_lshr_b32 s0, s0, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
+; GFX8-NEXT: s_or_b32 s1, s7, s1
+; GFX8-NEXT: s_or_b32 s0, s6, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -131,10 +148,16 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s3, 32, s3
+; GFX10-NEXT: s_lshl_b32 s4, s0, s2
+; GFX10-NEXT: s_lshl_b32 s5, s1, s3
; GFX10-NEXT: s_sub_i32 s2, 32, s2
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: s_sub_i32 s3, 32, s3
+; GFX10-NEXT: s_lshr_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s4, s0
+; GFX10-NEXT: s_or_b32 s1, s5, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -143,12 +166,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_lshl_b32 s6, s0, s2
+; GFX11-NEXT: s_lshl_b32 s7, s1, s3
; GFX11-NEXT: s_sub_i32 s2, 32, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_lshr_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_or_b32 s1, s7, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -188,20 +217,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s4, 32, s12
-; SI-NEXT: s_sub_i32 s5, 32, s13
-; SI-NEXT: s_sub_i32 s6, 32, s15
-; SI-NEXT: s_sub_i32 s7, 32, s14
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT: s_lshl_b32 s2, s8, s12
+; SI-NEXT: s_lshl_b32 s4, s9, s13
+; SI-NEXT: s_lshl_b32 s5, s10, s14
+; SI-NEXT: s_lshl_b32 s6, s11, s15
+; SI-NEXT: s_sub_i32 s7, 32, s15
+; SI-NEXT: s_sub_i32 s14, 32, s14
+; SI-NEXT: s_sub_i32 s13, 32, s13
+; SI-NEXT: s_sub_i32 s12, 32, s12
+; SI-NEXT: s_lshr_b32 s8, s8, s12
+; SI-NEXT: s_lshr_b32 s9, s9, s13
+; SI-NEXT: s_lshr_b32 s10, s10, s14
+; SI-NEXT: s_lshr_b32 s7, s11, s7
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_or_b32 s5, s5, s10
+; SI-NEXT: s_or_b32 s4, s4, s9
+; SI-NEXT: s_or_b32 s7, s2, s8
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -210,19 +247,27 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s5, 32, s15
-; GFX8-NEXT: s_sub_i32 s4, 32, s14
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: s_sub_i32 s3, 32, s13
-; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_sub_i32 s2, 32, s12
-; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_lshl_b32 s2, s8, s12
+; GFX8-NEXT: s_lshl_b32 s3, s9, s13
+; GFX8-NEXT: s_sub_i32 s6, 32, s15
+; GFX8-NEXT: s_sub_i32 s7, 32, s14
+; GFX8-NEXT: s_sub_i32 s13, 32, s13
+; GFX8-NEXT: s_sub_i32 s12, 32, s12
+; GFX8-NEXT: s_lshl_b32 s4, s10, s14
+; GFX8-NEXT: s_lshl_b32 s5, s11, s15
+; GFX8-NEXT: s_lshr_b32 s8, s8, s12
+; GFX8-NEXT: s_lshr_b32 s9, s9, s13
+; GFX8-NEXT: s_lshr_b32 s7, s10, s7
+; GFX8-NEXT: s_lshr_b32 s6, s11, s6
+; GFX8-NEXT: s_or_b32 s5, s5, s6
+; GFX8-NEXT: s_or_b32 s4, s4, s7
+; GFX8-NEXT: s_or_b32 s3, s3, s9
+; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -234,14 +279,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s2, 32, s12
-; GFX10-NEXT: s_sub_i32 s3, 32, s13
-; GFX10-NEXT: s_sub_i32 s4, 32, s15
-; GFX10-NEXT: s_sub_i32 s5, 32, s14
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2
+; GFX10-NEXT: s_lshl_b32 s2, s8, s12
+; GFX10-NEXT: s_lshl_b32 s3, s9, s13
+; GFX10-NEXT: s_sub_i32 s6, 32, s15
+; GFX10-NEXT: s_sub_i32 s7, 32, s14
+; GFX10-NEXT: s_sub_i32 s12, 32, s12
+; GFX10-NEXT: s_sub_i32 s13, 32, s13
+; GFX10-NEXT: s_lshl_b32 s4, s10, s14
+; GFX10-NEXT: s_lshl_b32 s5, s11, s15
+; GFX10-NEXT: s_lshr_b32 s8, s8, s12
+; GFX10-NEXT: s_lshr_b32 s9, s9, s13
+; GFX10-NEXT: s_lshr_b32 s6, s11, s6
+; GFX10-NEXT: s_lshr_b32 s7, s10, s7
+; GFX10-NEXT: s_or_b32 s5, s5, s6
+; GFX10-NEXT: s_or_b32 s4, s4, s7
+; GFX10-NEXT: s_or_b32 s2, s2, s8
+; GFX10-NEXT: s_or_b32 s3, s3, s9
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -250,16 +307,27 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s12
-; GFX11-NEXT: s_sub_i32 s3, 32, s13
-; GFX11-NEXT: s_sub_i32 s4, 32, s15
-; GFX11-NEXT: s_sub_i32 s5, 32, s14
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2
+; GFX11-NEXT: s_lshl_b32 s2, s8, s12
+; GFX11-NEXT: s_lshl_b32 s3, s9, s13
+; GFX11-NEXT: s_sub_i32 s6, 32, s15
+; GFX11-NEXT: s_sub_i32 s7, 32, s14
+; GFX11-NEXT: s_sub_i32 s12, 32, s12
+; GFX11-NEXT: s_sub_i32 s13, 32, s13
+; GFX11-NEXT: s_lshl_b32 s4, s10, s14
+; GFX11-NEXT: s_lshl_b32 s5, s11, s15
+; GFX11-NEXT: s_lshr_b32 s8, s8, s12
+; GFX11-NEXT: s_lshr_b32 s9, s9, s13
+; GFX11-NEXT: s_lshr_b32 s6, s11, s6
+; GFX11-NEXT: s_lshr_b32 s7, s10, s7
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s4, s4, s7
+; GFX11-NEXT: s_or_b32 s2, s2, s8
+; GFX11-NEXT: s_or_b32 s3, s3, s9
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..cb1266a048764 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -5,6 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; R600-LABEL: rotr_i32:
@@ -22,12 +23,15 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_i32 s4, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s2, s3
+; SI-NEXT: s_lshl_b32 s2, s2, s4
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -35,10 +39,13 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT: s_sub_i32 s4, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s2, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -47,18 +54,41 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT: s_sub_i32 s4, 32, s3
+; GFX10-NEXT: s_lshl_b32 s4, s2, s4
+; GFX10-NEXT: s_lshr_b32 s2, s2, s3
+; GFX10-NEXT: s_or_b32 s2, s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotr_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT: s_sub_i32 s4, 32, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s2, s4
+; GFX11-NEXT: s_lshr_b32 s2, s2, s3
+; GFX11-NEXT: s_or_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_i32 s4, 32, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s4, s2, s4
+; GFX12-NEXT: s_lshr_b32 s2, s2, s3
+; GFX12-NEXT: s_or_b32 s2, s4, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub i32 32, %y
%tmp1 = shl i32 %x, %tmp0
@@ -86,12 +116,18 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_sub_i32 s6, 32, s3
+; SI-NEXT: s_sub_i32 s8, 32, s2
+; SI-NEXT: s_lshr_b32 s2, s0, s2
+; SI-NEXT: s_lshr_b32 s3, s1, s3
+; SI-NEXT: s_lshl_b32 s0, s0, s8
+; SI-NEXT: s_lshl_b32 s1, s1, s6
+; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -100,11 +136,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: s_sub_i32 s6, 32, s3
+; GFX8-NEXT: s_sub_i32 s7, 32, s2
+; GFX8-NEXT: s_lshr_b32 s2, s0, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s7
+; GFX8-NEXT: s_lshl_b32 s6, s1, s6
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
+; GFX8-NEXT: s_or_b32 s1, s6, s1
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -116,8 +158,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: s_sub_i32 s4, 32, s3
+; GFX10-NEXT: s_sub_i32 s5, 32, s2
+; GFX10-NEXT: s_lshr_b32 s2, s0, s2
+; GFX10-NEXT: s_lshr_b32 s3, s1, s3
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_lshl_b32 s1, s1, s4
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -126,12 +176,40 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: s_sub_i32 s6, 32, s3
+; GFX11-NEXT: s_sub_i32 s7, 32, s2
+; GFX11-NEXT: s_lshr_b32 s2, s0, s2
+; GFX11-NEXT: s_lshr_b32 s3, s1, s3
+; GFX11-NEXT: s_lshl_b32 s0, s0, s7
+; GFX11-NEXT: s_lshl_b32 s1, s1, s6
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_v2i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_i32 s6, 32, s3
+; GFX12-NEXT: s_sub_co_i32 s7, 32, s2
+; GFX12-NEXT: s_lshr_b32 s2, s0, s2
+; GFX12-NEXT: s_lshr_b32 s3, s1, s3
+; GFX12-NEXT: s_lshl_b32 s0, s0, s7
+; GFX12-NEXT: s_lshl_b32 s1, s1, s6
+; GFX12-NEXT: s_or_b32 s0, s0, s2
+; GFX12-NEXT: s_or_b32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
%tmp1 = shl <2 x i32> %x, %tmp0
@@ -161,16 +239,28 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: s_sub_i32 s2, 32, s15
+; SI-NEXT: s_sub_i32 s4, 32, s14
+; SI-NEXT: s_sub_i32 s5, 32, s13
+; SI-NEXT: s_sub_i32 s6, 32, s12
+; SI-NEXT: s_lshr_b32 s7, s8, s12
+; SI-NEXT: s_lshr_b32 s12, s9, s13
+; SI-NEXT: s_lshr_b32 s13, s10, s14
+; SI-NEXT: s_lshr_b32 s14, s11, s15
+; SI-NEXT: s_lshl_b32 s6, s8, s6
+; SI-NEXT: s_lshl_b32 s5, s9, s5
+; SI-NEXT: s_lshl_b32 s4, s10, s4
+; SI-NEXT: s_lshl_b32 s2, s11, s2
+; SI-NEXT: s_or_b32 s8, s2, s14
+; SI-NEXT: s_or_b32 s4, s4, s13
+; SI-NEXT: s_or_b32 s5, s5, s12
+; SI-NEXT: s_or_b32 s6, s6, s7
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -179,15 +269,27 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mov_b32_e32 v1, s14
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
-; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
-; GFX8-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NEXT: s_sub_i32 s2, 32, s15
+; GFX8-NEXT: s_sub_i32 s3, 32, s14
+; GFX8-NEXT: s_sub_i32 s4, 32, s13
+; GFX8-NEXT: s_sub_i32 s5, 32, s12
+; GFX8-NEXT: s_lshl_b32 s5, s8, s5
+; GFX8-NEXT: s_lshl_b32 s4, s9, s4
+; GFX8-NEXT: s_lshl_b32 s3, s10, s3
+; GFX8-NEXT: s_lshl_b32 s2, s11, s2
+; GFX8-NEXT: s_lshr_b32 s6, s8, s12
+; GFX8-NEXT: s_lshr_b32 s7, s9, s13
+; GFX8-NEXT: s_lshr_b32 s8, s10, s14
+; GFX8-NEXT: s_lshr_b32 s9, s11, s15
+; GFX8-NEXT: s_or_b32 s2, s2, s9
+; GFX8-NEXT: s_or_b32 s3, s3, s8
+; GFX8-NEXT: s_or_b32 s4, s4, s7
+; GFX8-NEXT: s_or_b32 s5, s5, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -199,10 +301,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12
+; GFX10-NEXT: s_sub_i32 s2, 32, s15
+; GFX10-NEXT: s_sub_i32 s3, 32, s14
+; GFX10-NEXT: s_sub_i32 s4, 32, s13
+; GFX10-NEXT: s_sub_i32 s5, 32, s12
+; GFX10-NEXT: s_lshr_b32 s6, s8, s12
+; GFX10-NEXT: s_lshr_b32 s7, s9, s13
+; GFX10-NEXT: s_lshr_b32 s12, s10, s14
+; GFX10-NEXT: s_lshr_b32 s13, s11, s15
+; GFX10-NEXT: s_lshl_b32 s5, s8, s5
+; GFX10-NEXT: s_lshl_b32 s4, s9, s4
+; GFX10-NEXT: s_lshl_b32 s2, s11, s2
+; GFX10-NEXT: s_lshl_b32 s3, s10, s3
+; GFX10-NEXT: s_or_b32 s2, s2, s13
+; GFX10-NEXT: s_or_b32 s3, s3, s12
+; GFX10-NEXT: s_or_b32 s5, s5, s6
+; GFX10-NEXT: s_or_b32 s4, s4, s7
+; GFX10-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -211,14 +329,58 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12
+; GFX11-NEXT: s_sub_i32 s2, 32, s15
+; GFX11-NEXT: s_sub_i32 s3, 32, s14
+; GFX11-NEXT: s_sub_i32 s4, 32, s13
+; GFX11-NEXT: s_sub_i32 s5, 32, s12
+; GFX11-NEXT: s_lshr_b32 s6, s8, s12
+; GFX11-NEXT: s_lshr_b32 s7, s9, s13
+; GFX11-NEXT: s_lshr_b32 s12, s10, s14
+; GFX11-NEXT: s_lshr_b32 s13, s11, s15
+; GFX11-NEXT: s_lshl_b32 s5, s8, s5
+; GFX11-NEXT: s_lshl_b32 s4, s9, s4
+; GFX11-NEXT: s_lshl_b32 s2, s11, s2
+; GFX11-NEXT: s_lshl_b32 s3, s10, s3
+; GFX11-NEXT: s_or_b32 s2, s2, s13
+; GFX11-NEXT: s_or_b32 s3, s3, s12
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s4, s4, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_v4i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_i32 s2, 32, s15
+; GFX12-NEXT: s_sub_co_i32 s3, 32, s14
+; GFX12-NEXT: s_sub_co_i32 s4, 32, s13
+; GFX12-NEXT: s_sub_co_i32 s5, 32, s12
+; GFX12-NEXT: s_lshr_b32 s6, s8, s12
+; GFX12-NEXT: s_lshr_b32 s7, s9, s13
+; GFX12-NEXT: s_lshr_b32 s12, s10, s14
+; GFX12-NEXT: s_lshr_b32 s13, s11, s15
+; GFX12-NEXT: s_lshl_b32 s5, s8, s5
+; GFX12-NEXT: s_lshl_b32 s4, s9, s4
+; GFX12-NEXT: s_lshl_b32 s2, s11, s2
+; GFX12-NEXT: s_lshl_b32 s3, s10, s3
+; GFX12-NEXT: s_or_b32 s2, s2, s13
+; GFX12-NEXT: s_or_b32 s3, s3, s12
+; GFX12-NEXT: s_or_b32 s5, s5, s6
+; GFX12-NEXT: s_or_b32 s4, s4, s7
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
+; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
%tmp1 = shl <4 x i32> %x, %tmp0
@@ -357,6 +519,25 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-FAKE16-NEXT: global_store_b16 v[4:5], v0, off offset:8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_rotr_i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v2, v[2:3], off offset:48
+; GFX12-NEXT: global_load_u16 v0, v[0:1], off offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_sub_nc_u16 v1, 0, v2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v2, v2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b16 v0, v1, v0
+; GFX12-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off offset:8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
%a = load i16, ptr addrspace(1) %arrayidx
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 3fbfd756b97e6..107fc8aaa86a1 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -17,8 +17,9 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, v0
@@ -39,8 +40,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -85,8 +86,9 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, v0
@@ -107,8 +109,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll b/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll
index aa39797d74a10..a1710a9b5a7a8 100644
--- a/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll
@@ -1,15 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; This test checks that the address space casts for SPIR-V generic pointer casts
; are lowered correctly by the infer-address-spaces pass.
; RUN: opt < %s -passes=infer-address-spaces -S --mtriple=spirv64-unknown-unknown | FileCheck %s
-; Casting a global pointer to a global pointer.
+; Casting a global pointer to a global pointer.
; The uses of c2 will be replaced with %global.
; CHECK: @kernel1(ptr addrspace(1) %global)
define i1 @kernel1(ptr addrspace(1) %global) {
%c1 = addrspacecast ptr addrspace(1) %global to ptr addrspace(4)
%c2 = call ptr addrspace(1) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr addrspace(1) %global, null
- %b1 = icmp eq ptr addrspace(1) %c2, null
+ %b1 = icmp eq ptr addrspace(1) %c2, null
ret i1 %b1
}
@@ -31,7 +32,7 @@ define i1 @kernel3(ptr addrspace(1) %global) {
%c1 = addrspacecast ptr addrspace(1) %global to ptr addrspace(4)
%c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr null, null
- %b1 = icmp eq ptr %c2, null
+ %b1 = icmp eq ptr %c2, null
ret i1 %b1
}
@@ -42,7 +43,7 @@ define i1 @kernel4(ptr addrspace(3) %local) {
%c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4)
%c2 = call ptr addrspace(3) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr addrspace(3) %local, null
- %b1 = icmp eq ptr addrspace(3) %c2, null
+ %b1 = icmp eq ptr addrspace(3) %c2, null
ret i1 %b1
}
@@ -53,7 +54,7 @@ define i1 @kernel5(ptr addrspace(3) %local) {
%c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4)
%c2 = call ptr addrspace(1) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr addrspace(1) null, null
- %b1 = icmp eq ptr addrspace(1) %c2, null
+ %b1 = icmp eq ptr addrspace(1) %c2, null
ret i1 %b1
}
@@ -64,7 +65,7 @@ define i1 @kernel6(ptr addrspace(3) %local) {
%c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4)
%c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr null, null
- %b1 = icmp eq ptr %c2, null
+ %b1 = icmp eq ptr %c2, null
ret i1 %b1
}
@@ -75,7 +76,7 @@ define i1 @kernel7(ptr %private) {
%c1 = addrspacecast ptr %private to ptr addrspace(4)
%c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1)
; CHECK: %b1 = icmp eq ptr %private, null
- %b1 = icmp eq ptr %c2, null
+ %b1 = icmp eq ptr %c2, null
ret i1 %b1
}
More information about the llvm-commits
mailing list