[clang-tools-extra] [llvm] [InstCombine] Fold `icmp samesign u{gt/ge/lt/le} (X +nsw C2), C` -> `icmp s{gt/ge/lt/le} X, (C - C2)` (PR #169960)
Tirthankar Mazumder via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 30 03:40:32 PST 2025
Martin =?utf-8?q?Storsjö?= <martin at martin.st>,Florian Hahn
<flo at fhahn.com>,Luo Yuanke <lyk_03 at hotmail.com>,mitchell
<mitchell.xu2 at gmail.com>,Qihan Cai <caiqihan021 at hotmail.com>,wermos
<63574588+wermos at users.noreply.github.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/169960 at github.com>
https://github.com/wermos updated https://github.com/llvm/llvm-project/pull/169960
>From bb2f1ae26fdc7c3782ea953d0f8b97f50f702aac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin at martin.st>
Date: Fri, 28 Nov 2025 20:45:18 +0200
Subject: [PATCH 1/7] [llvm-readobj] Remove a leftover comment from
6ad4fdacaeea4777e98a3ab41512c49d3d1b6151. NFC.
This case did get documented upstream, in
https://github.com/MicrosoftDocs/cpp-docs/pull/4202, and the
way that llvm-readobj prints it, implemented in that commit, is
correct.
---
llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
index c6e409c63ef3a..b21a55085f833 100644
--- a/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/llvm/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -1478,9 +1478,6 @@ bool Decoder::dumpPackedARM64Entry(const object::COFFObjectFile &COFF,
19 + 2 * I + 1, 16 * I);
}
}
- // CR=2 is yet undocumented, see
- // https://github.com/MicrosoftDocs/cpp-docs/pull/4202 for upstream
- // progress on getting it documented.
if (RF.CR() == 2)
SW.startLine() << "pacibsp\n";
SW.startLine() << "end\n";
>From b6fa5a47e97cb3827cb4cae7905a6e0e08c949b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin at martin.st>
Date: Fri, 28 Nov 2025 20:28:52 +0200
Subject: [PATCH 2/7] [MC] [Win64EH] Fix the operator ordering for
UOP_SaveFPLRX. NFC.
The encoded offset should be (OffsetInBytes/8)-1 due to an
implicit offset of 1. Previously the operator ordering was
inverted. As the offset is a multiple of 8, the incorrect
operator ordering did produce the right result in all cases
anyway.
---
llvm/lib/MC/MCWin64EH.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 6d146f6cedd6e..a7ce8d527250f 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -673,7 +673,7 @@ static void ARM64EmitUnwindCode(MCStreamer &streamer,
break;
case Win64EH::UOP_SaveFPLRX:
b = 0x80;
- b |= ((inst.Offset - 1) >> 3) & 0x3F;
+ b |= ((inst.Offset >> 3) - 1) & 0x3F;
streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFPLR:
>From 89ff7b1948103ce7865d1fe5533556cb0677b589 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 28 Nov 2025 22:26:19 +0000
Subject: [PATCH 3/7] [LV] Vectorize selecting last IV of min/max element.
(#141431)
Add support for vectorizing loops that select the index of the minimum
or maximum element. The patch implements vectorizing those patterns by
combining Min/Max and FindFirstIV reductions.
It extends matching Min/Max reductions to allow in-loop users that are
FindLastIV reductions. It records a flag indicating that the Min/Max
reduction is used by another reduction. The extra user is then checked as
part of the new `handleMultiUseReductions` VPlan transformation.
It processes any reduction that has other reduction users. The reduction
using the min/max reduction currently must be a FindLastIV reduction,
which needs adjusting to compute the correct result:
1. We need to find the last IV for which the condition based on the
min/max reduction is true,
2. Compare the partial min/max reduction result to its final value and,
3. Select the lanes of the partial FindLastIV reductions which
correspond to the lanes matching the min/max reduction result.
Depends on https://github.com/llvm/llvm-project/pull/140451
PR: https://github.com/llvm/llvm-project/pull/141431
---
llvm/include/llvm/Analysis/IVDescriptors.h | 18 +-
llvm/lib/Analysis/IVDescriptors.cpp | 51 +++
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 2 +
.../Vectorize/LoopVectorizationLegality.cpp | 5 +
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 24 +-
.../Vectorize/VPlanConstruction.cpp | 176 ++++++++++-
.../Transforms/Vectorize/VPlanPatternMatch.h | 6 +
.../Transforms/Vectorize/VPlanTransforms.h | 5 +
.../LoopVectorize/AArch64/select-index.ll | 297 ++++++++++++++++--
.../select-index-interleaving.ll | 228 ++++++++++++--
.../LoopVectorize/select-smax-last-index.ll | 135 +++++++-
.../LoopVectorize/select-smin-last-index.ll | 135 +++++++-
.../LoopVectorize/select-umax-last-index.ll | 135 +++++++-
.../LoopVectorize/select-umin-last-index.ll | 180 +++++++++--
15 files changed, 1283 insertions(+), 134 deletions(-)
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 2c8484fde5b16..fc141ed6d96fe 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -95,12 +95,17 @@ class RecurrenceDescriptor {
RecurKind K, FastMathFlags FMF, Instruction *ExactFP,
Type *RT, bool Signed, bool Ordered,
SmallPtrSetImpl<Instruction *> &CI,
- unsigned MinWidthCastToRecurTy)
+ unsigned MinWidthCastToRecurTy,
+ bool PhiHasUsesOutsideReductionChain = false)
: IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit),
Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT),
IsSigned(Signed), IsOrdered(Ordered),
+ PhiHasUsesOutsideReductionChain(PhiHasUsesOutsideReductionChain),
MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
CastInsts.insert_range(CI);
+ assert(
+ (!PhiHasUsesOutsideReductionChain || isMinMaxRecurrenceKind(K)) &&
+ "Only min/max recurrences are allowed to have multiple uses currently");
}
/// This POD struct holds information about a potential recurrence operation.
@@ -339,6 +344,13 @@ class RecurrenceDescriptor {
/// Expose an ordered FP reduction to the instance users.
bool isOrdered() const { return IsOrdered; }
+ /// Returns true if the reduction PHI has any uses outside the reduction
+ /// chain. This is relevant for min/max reductions that are part of a
+ /// FindLastIV pattern.
+ bool hasUsesOutsideReductionChain() const {
+ return PhiHasUsesOutsideReductionChain;
+ }
+
/// Attempts to find a chain of operations from Phi to LoopExitInst that can
/// be treated as a set of reductions instructions for in-loop reductions.
LLVM_ABI SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
@@ -376,6 +388,10 @@ class RecurrenceDescriptor {
// Currently only a non-reassociative FAdd can be considered in-order,
// if it is also the only FAdd in the PHI's use chain.
bool IsOrdered = false;
+ // True if the reduction PHI has in-loop users outside the reduction chain.
+ // This is relevant for min/max reductions that are part of a FindLastIV
+ // pattern.
+ bool PhiHasUsesOutsideReductionChain = false;
// Instructions used for type-promoting the recurrence.
SmallPtrSet<Instruction *, 8> CastInsts;
// The minimum width used by the recurrence.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 4d21f1c7e2de2..7624e0ed6f2b0 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -216,6 +216,52 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
return true;
}
+/// Returns true if \p Phi is a min/max reduction matching \p Kind where \p Phi
+/// is used outside the reduction chain. This is common for loops selecting the
+/// index of a minimum/maximum value (argmin/argmax).
+static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
+ PHINode *Phi, RecurKind Kind, Loop *TheLoop, RecurrenceDescriptor &RedDes) {
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ if (!Latch)
+ return false;
+
+ assert(Phi->getNumIncomingValues() == 2 && "phi must have 2 incoming values");
+ Value *Inc = Phi->getIncomingValueForBlock(Latch);
+ if (Phi->hasOneUse() || !Inc->hasOneUse() ||
+ !RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
+ return false;
+
+ Value *A, *B;
+ bool IsMinMax = [&]() {
+ switch (Kind) {
+ case RecurKind::UMax:
+ return match(Inc, m_UMax(m_Value(A), m_Value(B)));
+ case RecurKind::UMin:
+ return match(Inc, m_UMin(m_Value(A), m_Value(B)));
+ case RecurKind::SMax:
+ return match(Inc, m_SMax(m_Value(A), m_Value(B)));
+ case RecurKind::SMin:
+ return match(Inc, m_SMin(m_Value(A), m_Value(B)));
+ default:
+ llvm_unreachable("all min/max kinds must be handled");
+ }
+ }();
+ if (!IsMinMax)
+ return false;
+
+ if (A == B || (A != Phi && B != Phi))
+ return false;
+
+ SmallPtrSet<Instruction *, 4> CastInsts;
+ Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
+ RedDes =
+ RecurrenceDescriptor(RdxStart, /*Exit=*/nullptr, /*Store=*/nullptr, Kind,
+ FastMathFlags(), /*ExactFP=*/nullptr, Phi->getType(),
+ /*Signed=*/false, /*Ordered=*/false, CastInsts,
+ /*MinWidthCastToRecurTy=*/-1U, /*PhiHasUsesOutsideReductionChain=*/true);
+ return true;
+}
+
bool RecurrenceDescriptor::AddReductionVar(
PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -227,6 +273,11 @@ bool RecurrenceDescriptor::AddReductionVar(
if (Phi->getParent() != TheLoop->getHeader())
return false;
+ // Check for min/max reduction variables that feed other users in the loop.
+ if (isMinMaxReductionPhiWithUsersOutsideReductionChain(Phi, Kind, TheLoop,
+ RedDes))
+ return true;
+
// Obtain the reduction start value from the value that comes from the loop
// preheader.
Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 5f1db9c54b291..0f256398e5b1e 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1254,6 +1254,8 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
/*DemandedBits=*/nullptr,
/*AC=*/nullptr, /*DT=*/nullptr, SE))
return std::nullopt;
+ if (RdxDesc.hasUsesOutsideReductionChain())
+ return std::nullopt;
RecurKind RK = RdxDesc.getRecurrenceKind();
// Skip unsupported reductions.
// TODO: Handle additional reductions, including min-max reductions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 379f4e6602a7d..f2e9c3146b0e8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -877,6 +877,11 @@ bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
AllowedExit.insert(RedDes.getLoopExitInstr());
Reductions[Phi] = RedDes;
+ assert((!RedDes.hasUsesOutsideReductionChain() ||
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(
+ RedDes.getRecurrenceKind())) &&
+ "Only min/max recurrences are allowed to have multiple uses "
+ "currently");
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 62b68232925d9..c8495f1f32bd8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6593,6 +6593,11 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
PHINode *Phi = Reduction.first;
const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
+ // separately and should not be considered for in-loop reductions.
+ if (RdxDesc.hasUsesOutsideReductionChain())
+ continue;
+
// We don't collect reductions that are type promoted (yet).
if (RdxDesc.getRecurrenceType() != Phi->getType())
continue;
@@ -7998,9 +8003,10 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
MapVector<Instruction *,
SmallVector<std::pair<PartialReductionChain, unsigned>>>
ChainsByPhi;
- for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
- getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
- ChainsByPhi[Phi]);
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+ if (Instruction *RdxExitInstr = RdxDesc.getLoopExitInstr())
+ getScaledReductions(Phi, RdxExitInstr, Range, ChainsByPhi[Phi]);
+ }
// A partial reduction is invalid if any of its extends are used by
// something that isn't another partial reduction. This is because the
@@ -8221,7 +8227,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
PhiRecipe = new VPReductionPHIRecipe(
Phi, RdxDesc.getRecurrenceKind(), *StartV,
getReductionStyle(UseInLoopReduction, UseOrderedReductions,
- ScaleFactor));
+ ScaleFactor),
+ RdxDesc.hasUsesOutsideReductionChain());
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8555,6 +8562,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ // Apply mandatory transformation to handle reductions with multiple in-loop
+ // uses if possible, bail out otherwise.
+ if (!VPlanTransforms::runPass(VPlanTransforms::handleMultiUseReductions,
+ *Plan))
+ return nullptr;
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a464d019754ba..6ca750fc53279 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2071,6 +2071,9 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe,
static inline bool classof(const VPValue *V) {
return isa<VPHeaderPHIRecipe>(V->getDefiningRecipe());
}
+ static inline bool classof(const VPSingleDefRecipe *R) {
+ return isa<VPHeaderPHIRecipe>(static_cast<const VPRecipeBase *>(R));
+ }
/// Generate the phi nodes.
void execute(VPTransformState &State) override = 0;
@@ -2136,7 +2139,7 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
return R && classof(R);
}
- static inline bool classof(const VPHeaderPHIRecipe *R) {
+ static inline bool classof(const VPSingleDefRecipe *R) {
return classof(static_cast<const VPRecipeBase *>(R));
}
@@ -2432,19 +2435,27 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
ReductionStyle Style;
+ /// The phi is part of a multi-use reduction (e.g., used in FindLastIV
+ /// patterns for argmin/argmax).
+ /// TODO: Also support cases where the phi itself has a single use, but its
+ /// compare has multiple uses.
+ bool HasUsesOutsideReductionChain;
+
public:
/// Create a new VPReductionPHIRecipe for the reduction \p Phi.
VPReductionPHIRecipe(PHINode *Phi, RecurKind Kind, VPValue &Start,
- ReductionStyle Style)
+ ReductionStyle Style,
+ bool HasUsesOutsideReductionChain = false)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), Kind(Kind),
- Style(Style) {}
+ Style(Style),
+ HasUsesOutsideReductionChain(HasUsesOutsideReductionChain) {}
~VPReductionPHIRecipe() override = default;
VPReductionPHIRecipe *clone() override {
auto *R = new VPReductionPHIRecipe(
dyn_cast_or_null<PHINode>(getUnderlyingValue()), getRecurrenceKind(),
- *getOperand(0), Style);
+ *getOperand(0), Style, HasUsesOutsideReductionChain);
R->addOperand(getBackedgeValue());
return R;
}
@@ -2481,6 +2492,11 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Returns true if the reduction outputs a vector with a scaled down VF.
bool isPartialReduction() const { return getVFScaleFactor() > 1; }
+ /// Returns true, if the phi is part of a multi-use reduction.
+ bool hasUsesOutsideReductionChain() const {
+ return HasUsesOutsideReductionChain;
+ }
+
/// Returns true if the recipe only uses the first lane of operand \p Op.
bool usesFirstLaneOnly(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 92969c8ed9ec0..47632dcfc277e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#define DEBUG_TYPE "vplan"
@@ -827,15 +828,18 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
Branch->setMetadata(LLVMContext::MD_prof, BranchWeights);
}
-/// If \p RedPhiR is used by a ComputeReductionResult recipe, return it.
-/// Otherwise return nullptr.
-static VPInstruction *
-findComputeReductionResult(VPReductionPHIRecipe *RedPhiR) {
- auto It = find_if(RedPhiR->users(), [](VPUser *U) {
- auto *VPI = dyn_cast<VPInstruction>(U);
- return VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult;
- });
- return It == RedPhiR->user_end() ? nullptr : cast<VPInstruction>(*It);
+/// If \p V is used by a recipe matching pattern \p P, return it. Otherwise
+/// return nullptr.
+template <typename MatchT>
+static VPRecipeBase *findUserOf(VPValue *V, const MatchT &P) {
+ auto It = find_if(V->users(), match_fn(P));
+ return It == V->user_end() ? nullptr : cast<VPRecipeBase>(*It);
+}
+
+/// If \p V is used by a VPInstruction with \p Opcode, return it. Otherwise
+/// return nullptr.
+template <unsigned Opcode> static VPInstruction *findUserOf(VPValue *V) {
+ return cast_or_null<VPInstruction>(findUserOf(V, m_VPInstruction<Opcode>()));
}
bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
@@ -932,7 +936,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
// If we exit early due to NaNs, compute the final reduction result based on
// the reduction phi at the beginning of the last vector iteration.
- auto *RdxResult = findComputeReductionResult(RedPhiR);
+ auto *RdxResult =
+ findUserOf<VPInstruction::ComputeReductionResult>(RedPhiR);
auto *NewSel = MiddleBuilder.createSelect(AnyNaNLane, RedPhiR,
RdxResult->getOperand(1));
@@ -991,3 +996,154 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
MiddleTerm->setOperand(0, NewCond);
return true;
}
+
+bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) {
+ for (auto &PhiR : make_early_inc_range(
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())) {
+ auto *MinMaxPhiR = dyn_cast<VPReductionPHIRecipe>(&PhiR);
+ // TODO: check for multi-uses in VPlan directly.
+ if (!MinMaxPhiR || !MinMaxPhiR->hasUsesOutsideReductionChain())
+ continue;
+
+ // MinMaxPhiR has users outside the reduction cycle in the loop. Check if
+ // the only other user is a FindLastIV reduction. MinMaxPhiR must have
+ // exactly 3 users: 1) the min/max operation, 2) the compare of a FindLastIV
+ // reduction and 3) ComputeReductionResult. The comparison must compare
+ // MinMaxPhiR against the min/max operand used for the min/max reduction
+ // and only be used by the select of the FindLastIV reduction.
+ RecurKind RdxKind = MinMaxPhiR->getRecurrenceKind();
+ assert(
+ RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind) &&
+ "only min/max recurrences support users outside the reduction chain");
+
+ auto *MinMaxOp =
+ dyn_cast<VPRecipeWithIRFlags>(MinMaxPhiR->getBackedgeValue());
+ if (!MinMaxOp)
+ return false;
+
+ // Check that MinMaxOp is a VPWidenIntrinsicRecipe or VPReplicateRecipe
+ // with an intrinsic that matches the reduction kind.
+ Intrinsic::ID ExpectedIntrinsicID = getMinMaxReductionIntrinsicOp(RdxKind);
+ if (!match(MinMaxOp, m_Intrinsic(ExpectedIntrinsicID)))
+ return false;
+
+ // MinMaxOp must have 2 users: 1) MinMaxPhiR and 2) ComputeReductionResult
+ // (asserted below).
+ assert(MinMaxOp->getNumUsers() == 2 &&
+ "MinMaxOp must have exactly 2 users");
+ VPValue *MinMaxOpValue = MinMaxOp->getOperand(0);
+ if (MinMaxOpValue == MinMaxPhiR)
+ MinMaxOpValue = MinMaxOp->getOperand(1);
+
+ VPValue *CmpOpA;
+ VPValue *CmpOpB;
+ CmpPredicate Pred;
+ auto *Cmp = dyn_cast_or_null<VPRecipeWithIRFlags>(findUserOf(
+ MinMaxPhiR, m_Cmp(Pred, m_VPValue(CmpOpA), m_VPValue(CmpOpB))));
+ if (!Cmp || Cmp->getNumUsers() != 1 ||
+ (CmpOpA != MinMaxOpValue && CmpOpB != MinMaxOpValue))
+ return false;
+
+ if (MinMaxOpValue != CmpOpB)
+ Pred = CmpInst::getSwappedPredicate(Pred);
+
+ // MinMaxPhiR must have exactly 3 users:
+ // * MinMaxOp,
+ // * Cmp (that's part of a FindLastIV chain),
+ // * ComputeReductionResult.
+ if (MinMaxPhiR->getNumUsers() != 3)
+ return false;
+
+ VPInstruction *MinMaxResult =
+ findUserOf<VPInstruction::ComputeReductionResult>(MinMaxPhiR);
+ assert(is_contained(MinMaxPhiR->users(), MinMaxOp) &&
+ "one user must be MinMaxOp");
+ assert(MinMaxResult && "MinMaxResult must be a user of MinMaxPhiR");
+ assert(is_contained(MinMaxOp->users(), MinMaxResult) &&
+ "MinMaxResult must be a user of MinMaxOp (and of MinMaxPhiR");
+
+ // Cmp must be used by the select of a FindLastIV chain.
+ VPValue *Sel = dyn_cast<VPSingleDefRecipe>(Cmp->getSingleUser());
+ VPValue *IVOp, *FindIV;
+ if (!Sel || Sel->getNumUsers() != 2 ||
+ !match(Sel,
+ m_Select(m_Specific(Cmp), m_VPValue(IVOp), m_VPValue(FindIV))))
+ return false;
+
+ if (!isa<VPReductionPHIRecipe>(FindIV)) {
+ std::swap(FindIV, IVOp);
+ Pred = CmpInst::getInversePredicate(Pred);
+ }
+
+ auto *FindIVPhiR = dyn_cast<VPReductionPHIRecipe>(FindIV);
+ if (!FindIVPhiR || !RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+ FindIVPhiR->getRecurrenceKind()))
+ return false;
+
+ assert(match(IVOp, m_TruncOrSelf(m_VPValue(IVOp))) &&
+ isa<VPWidenIntOrFpInductionRecipe>(IVOp) &&
+ "other select operand must be a (truncated) wide induction");
+
+ CmpInst::Predicate RdxPredicate = [RdxKind]() {
+ switch (RdxKind) {
+ case RecurKind::UMin:
+ return CmpInst::ICMP_UGE;
+ case RecurKind::UMax:
+ return CmpInst::ICMP_ULE;
+ case RecurKind::SMax:
+ return CmpInst::ICMP_SLE;
+ case RecurKind::SMin:
+ return CmpInst::ICMP_SGE;
+ default:
+ llvm_unreachable("unhandled recurrence kind");
+ }
+ }();
+
+ // TODO: Strict predicates need to find the first IV value for which the
+ // predicate holds, not the last.
+ if (Pred != RdxPredicate)
+ return false;
+
+ assert(!FindIVPhiR->isInLoop() && !FindIVPhiR->isOrdered() &&
+ "cannot handle inloop/ordered reductions yet");
+
+ // The reduction using MinMaxPhiR needs adjusting to compute the correct
+ // result:
+ // 1. We need to find the last IV for which the condition based on the
+ // min/max recurrence is true,
+ // 2. Compare the partial min/max reduction result to its final value and,
+ // 3. Select the lanes of the partial FindLastIV reductions which
+ // correspond to the lanes matching the min/max reduction result.
+ //
+ // For example, this transforms
+ // vp<%min.result> = compute-reduction-result ir<%min.val>,
+ // ir<%min.val.next>
+ // vp<%find.iv.result = compute-find-iv-result ir<%min.idx>, ir<0>,
+ // SENTINEL, vp<%min.idx.next>
+ //
+ // into:
+ //
+ // vp<min.result> = compute-reduction-result ir<%min.val>, ir<%min.val.next>
+ // vp<%final.min.cmp> = icmp eq ir<%min.val.next>, vp<min.result>
+ // vp<%final.iv> = select vp<%final.min.cmp>, ir<%min.idx.next>, SENTINEL
+ // vp<%find.iv.result> = compute-find-iv-result ir<%min.idx>, ir<0>,
+ // SENTINEL, vp<%final.iv>
+ VPInstruction *FindIVResult =
+ findUserOf<VPInstruction::ComputeFindIVResult>(FindIVPhiR);
+ assert(FindIVResult->getParent() == MinMaxResult->getParent() &&
+ "both results must be computed in the same block");
+ MinMaxResult->moveBefore(*FindIVResult->getParent(),
+ FindIVResult->getIterator());
+
+ VPBuilder B(FindIVResult);
+ VPValue *MinMaxExiting = MinMaxResult->getOperand(1);
+ auto *FinalMinMaxCmp =
+ B.createICmp(CmpInst::ICMP_EQ, MinMaxExiting, MinMaxResult);
+ VPValue *Sentinel = FindIVResult->getOperand(2);
+ VPValue *LastIVExiting = FindIVResult->getOperand(3);
+ auto *FinalIVSelect =
+ B.createSelect(FinalMinMaxCmp, LastIVExiting, Sentinel);
+ FindIVResult->setOperand(3, FinalIVSelect);
+ }
+ return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 07dfe31eea46d..750ef8edd94bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -468,6 +468,12 @@ inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) {
return m_Unary<Instruction::Trunc, Op0_t>(Op0);
}
+template <typename Op0_t>
+inline match_combine_or<AllRecipe_match<Instruction::Trunc, Op0_t>, Op0_t>
+m_TruncOrSelf(const Op0_t &Op0) {
+ return m_CombineOr(m_Trunc(Op0), Op0);
+}
+
template <typename Op0_t>
inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) {
return m_Unary<Instruction::ZExt, Op0_t>(Op0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 6245a5107a5d0..ae3797dee1f07 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -145,6 +145,11 @@ struct VPlanTransforms {
GetIntOrFpInductionDescriptor,
const TargetLibraryInfo &TLI);
+ /// Try to legalize reductions with multiple in-loop uses. Currently only
+ /// min/max reductions used by FindLastIV reductions are supported. Otherwise
+ /// return false.
+ static bool handleMultiUseReductions(VPlan &Plan);
+
/// Try to have all users of fixed-order recurrences appear after the recipe
/// defining their previous value, by either sinking users or hoisting recipes
/// defining their previous value (and its operands). Then introduce
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-index.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-index.ll
index 32d419cc0934a..56d34a61be1db 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-index.ll
@@ -47,11 +47,58 @@ define i64 @test_vectorize_select_umin_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umin_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ splat (i64 100), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ splat (i64 100), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge <2 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <2 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[VEC_PHI2]], <2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[VEC_PHI3]], <2 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_IND]], <2 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <2 x i1> [[TMP4]], <2 x i64> [[STEP_ADD]], <2 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <2 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x i64> [[TMP7]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x i64> [[TMP8]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 100, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 100, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 8
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i64 [[MIN_VAL]], [[L]]
@@ -59,9 +106,9 @@ define i64 @test_vectorize_select_umin_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -131,11 +178,58 @@ define i64 @test_vectorize_select_smin_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smin_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[VEC_PHI2]], <2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[VEC_PHI3]], <2 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_IND]], <2 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <2 x i1> [[TMP4]], <2 x i64> [[STEP_ADD]], <2 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <2 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x i64> [[TMP7]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x i64> [[TMP8]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 8
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[MIN_VAL]], [[L]]
@@ -143,9 +237,9 @@ define i64 @test_vectorize_select_smin_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -215,11 +309,58 @@ define i64 @test_vectorize_select_umax_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umax_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <2 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <2 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[VEC_PHI2]], <2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[VEC_PHI3]], <2 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_IND]], <2 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <2 x i1> [[TMP4]], <2 x i64> [[STEP_ADD]], <2 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <2 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x i64> [[TMP7]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x i64> [[TMP8]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 8
; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
@@ -227,9 +368,9 @@ define i64 @test_vectorize_select_umax_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -299,11 +440,58 @@ define i64 @test_vectorize_select_smax_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smax_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[GEP]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sle <2 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <2 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[VEC_PHI2]], <2 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[VEC_PHI3]], <2 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_IND]], <2 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <2 x i1> [[TMP4]], <2 x i64> [[STEP_ADD]], <2 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <2 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x i64> [[TMP7]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x i64> [[TMP8]], <2 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 8
; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
@@ -311,9 +499,9 @@ define i64 @test_vectorize_select_smax_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -383,22 +571,71 @@ define i32 @test_multi_use_reduction_with_trunc_iv(ptr %src, i32 %n) {
; CHECK-NEXT: [[PRE:%.*]] = icmp eq i32 [[N]], 0
; CHECK-NEXT: br i1 [[PRE]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
; CHECK: [[LOOP_PREHEADER]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1, i32 2, i32 3, i32 4>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[IV:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[GEP_SRC]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[WIDE_LOAD]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <4 x i32> [[WIDE_LOAD4]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP5]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI2]])
+; CHECK-NEXT: [[TMP6]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[WIDE_LOAD4]], <4 x i32> [[VEC_PHI3]])
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[VEC_IND]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI1]], <4 x i32> [[STEP_ADD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <4 x i32> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 1, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 1, %[[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i32 [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i32 [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i32 [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i32 [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC1]], align 4
; CHECK-NEXT: [[C_0:%.*]] = icmp ugt i32 [[L]], [[MIN_VAL]]
; CHECK-NEXT: [[MIN_VAL_NEXT]] = tail call i32 @llvm.umin.i32(i32 [[L]], i32 [[MIN_VAL]])
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[C_0]], i32 [[MIN_IDX]], i32 [[IV_TRUNC]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N_EXT]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT_LOOPEXIT]]:
-; CHECK-NEXT: [[MIN_IDX_NEXT_LCSSA:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX_NEXT_LCSSA:%.*]] = phi i32 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-index-interleaving.ll b/llvm/test/Transforms/LoopVectorize/select-index-interleaving.ll
index 9d97c7f9cec09..1638360d08e99 100644
--- a/llvm/test/Transforms/LoopVectorize/select-index-interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-index-interleaving.ll
@@ -47,11 +47,58 @@ define i64 @test_vectorize_select_umin_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umin_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 50), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 50), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge <4 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI2]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP7]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP8]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 50, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 50, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i64 [[MIN_VAL]], [[L]]
@@ -59,9 +106,9 @@ define i64 @test_vectorize_select_umin_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -131,11 +178,58 @@ define i64 @test_vectorize_select_smin_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smin_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <4 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[VEC_PHI2]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP7]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP8]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[MIN_VAL]], [[L]]
@@ -143,9 +237,9 @@ define i64 @test_vectorize_select_smin_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -215,11 +309,58 @@ define i64 @test_vectorize_select_umax_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umax_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[VEC_PHI2]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP7]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP8]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
@@ -227,9 +368,9 @@ define i64 @test_vectorize_select_umax_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -299,11 +440,58 @@ define i64 @test_vectorize_select_smax_last_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smax_last_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[GEP]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sle <4 x i64> [[VEC_PHI2]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <4 x i64> [[VEC_PHI3]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP5]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI2]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP6]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI3]], <4 x i64> [[WIDE_LOAD4]])
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP7]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP12]], <4 x i64> [[TMP8]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[RDX_MINMAX5]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP15]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX6]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
@@ -311,9 +499,9 @@ define i64 @test_vectorize_select_smax_last_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
index 0e27efd788fd6..a6ff83e926140 100644
--- a/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-smax-last-index.ll
@@ -5,11 +5,46 @@ define i64 @test_vectorize_select_smax_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smax_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sle <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
@@ -17,9 +52,9 @@ define i64 @test_vectorize_select_smax_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -47,11 +82,46 @@ define i64 @test_vectorize_select_smax_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smax_idx_cond_flipped(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[L]], [[MIN_VAL]]
@@ -59,9 +129,9 @@ define i64 @test_vectorize_select_smax_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -219,11 +289,46 @@ define i64 @test_vectorize_select_smax_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smax_idx_min_ops_switched(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sle <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[WIDE_LOAD]], <4 x i64> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[MIN_VAL]], [[L]]
@@ -231,9 +336,9 @@ define i64 @test_vectorize_select_smax_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-smin-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-smin-last-index.ll
index f9ef340a3e2f8..370ed866df4cd 100644
--- a/llvm/test/Transforms/LoopVectorize/select-smin-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-smin-last-index.ll
@@ -7,11 +7,46 @@ define i64 @test_vectorize_select_smin_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smin_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[MIN_VAL]], [[L]]
@@ -19,9 +54,9 @@ define i64 @test_vectorize_select_smin_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -49,11 +84,46 @@ define i64 @test_vectorize_select_smin_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smin_idx_cond_flipped(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sle <4 x i64> [[WIDE_LOAD]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sle i64 [[L]], [[MIN_VAL]]
@@ -61,9 +131,9 @@ define i64 @test_vectorize_select_smin_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -221,11 +291,46 @@ define i64 @test_vectorize_select_smin_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_smin_idx_min_ops_switched(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[WIDE_LOAD]], <4 x i64> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[MIN_VAL]], [[L]]
@@ -233,9 +338,9 @@ define i64 @test_vectorize_select_smin_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
index 54281daf26790..b7638dda4a037 100644
--- a/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-umax-last-index.ll
@@ -5,11 +5,46 @@ define i64 @test_vectorize_select_umax_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umax_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
@@ -17,9 +52,9 @@ define i64 @test_vectorize_select_umax_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -47,11 +82,46 @@ define i64 @test_vectorize_select_umax_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umax_idx_cond_flipped(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <4 x i64> [[WIDE_LOAD]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i64 [[L]], [[MIN_VAL]]
@@ -59,9 +129,9 @@ define i64 @test_vectorize_select_umax_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -219,11 +289,46 @@ define i64 @test_vectorize_select_umax_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umax_idx_min_ops_switched(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[WIDE_LOAD]], <4 x i64> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[MIN_VAL]], [[L]]
@@ -231,9 +336,9 @@ define i64 @test_vectorize_select_umax_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-NEXT: [[MAX_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MAX_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MAX_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/select-umin-last-index.ll b/llvm/test/Transforms/LoopVectorize/select-umin-last-index.ll
index da5ff7246a0c0..407a27054d764 100644
--- a/llvm/test/Transforms/LoopVectorize/select-umin-last-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-umin-last-index.ll
@@ -7,11 +7,46 @@ define i64 @test_vectorize_select_umin_idx(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 140), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP3]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP4]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP8]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP9]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP9]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 140, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 140, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i64 [[MIN_VAL]], [[L]]
@@ -19,9 +54,9 @@ define i64 @test_vectorize_select_umin_idx(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -49,11 +84,46 @@ define i64 @test_vectorize_select_umin_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx_cond_flipped(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 130), %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 130, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 130, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[L]], [[MIN_VAL]]
@@ -61,9 +131,9 @@ define i64 @test_vectorize_select_umin_idx_cond_flipped(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -91,11 +161,46 @@ define i64 @test_vectorize_select_umin_idx_select_ops_flipped(ptr %src, i64 %n)
; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx_select_ops_flipped(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 120), %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[WIDE_LOAD]], [[VEC_PHI1]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[VEC_PHI1]], <4 x i64> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 120, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 120, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[L]], [[MIN_VAL]]
@@ -103,9 +208,9 @@ define i64 @test_vectorize_select_umin_idx_select_ops_flipped(ptr %src, i64 %n)
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[MIN_IDX]], i64 [[IV]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -221,11 +326,46 @@ define i64 @test_vectorize_select_umin_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-LABEL: define i64 @test_vectorize_select_umin_idx_min_ops_switched(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 90), %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_PHI1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[WIDE_LOAD]], <4 x i64> [[VEC_PHI1]])
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP3]], <4 x i64> splat (i64 -9223372036854775808)
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP8]], -9223372036854775808
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 90, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ 90, %[[ENTRY]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_IDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MIN_IDX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN_VAL:%.*]] = phi i64 [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ [[MIN_VAL_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV1]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP1]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp uge i64 [[MIN_VAL]], [[L]]
@@ -233,9 +373,9 @@ define i64 @test_vectorize_select_umin_idx_min_ops_switched(ptr %src, i64 %n) {
; CHECK-NEXT: [[MIN_IDX_NEXT]] = select i1 [[CMP]], i64 [[IV1]], i64 [[MIN_IDX]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[MIN_IDX_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
>From d8f660d053a01e904e2baffe6ef9a0811c5ca28f Mon Sep 17 00:00:00 2001
From: Luo Yuanke <lyk_03 at hotmail.com>
Date: Sat, 29 Nov 2025 07:27:19 +0800
Subject: [PATCH 4/7] [RegAlloc] Relax the split constrain on MBB prolog
(#168259)
https://reviews.llvm.org/D52052 prevents register splitting on any MBB
which has prolog instructions defining the exec register (or a mask register
that activates the threads of a warp on a GPU). That constraint seems too
strict, because 1) if the split is allowed, it may fit the free live range
of a physical register, and no spill will happen; 2) the register class of
the register being split may not be the same as that of the register
defined in the prolog, so there is no interference with the register being
defined in the prolog.
The current code has another small issue. MBB->getFirstNonDebugInstr()
just skips debug instructions, but SA->getFirstSplitPoint(Number) would skip
label and phi instructions as well. This causes some MBBs with label
instructions to be treated as having a prolog.
This patch relaxes the split constraint on MBBs with a prolog by checking
whether the register defined in the prolog has a common register class with
the register being split. It allows the split if the register defined in the
prolog is a physical register or there is no common register class.
---------
Co-authored-by: Yuanke Luo <ykluo at birentech.com>
---
llvm/lib/CodeGen/RegAllocGreedy.cpp | 9 +-
llvm/lib/CodeGen/SplitKit.cpp | 48 +
llvm/lib/CodeGen/SplitKit.h | 8 +
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 5369 +++++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 187 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 293 +-
.../ran-out-of-sgprs-allocation-failure.mir | 62 +-
.../test/CodeGen/AMDGPU/spill-before-exec.mir | 5 +
.../CodeGen/AMDGPU/spill-before-exec2.mir | 167 +
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 537 +-
.../CodeGen/X86/2008-04-17-CoalescerBug.ll | 78 +-
11 files changed, 3519 insertions(+), 3244 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index a059cb55371a3..4db20dc39fb32 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -774,8 +774,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
// Abort if the spill cannot be inserted at the MBB' start
if (((BC.Entry == SpillPlacement::MustSpill) ||
(BC.Entry == SpillPlacement::PrefSpill)) &&
- SlotIndex::isEarlierInstr(BI.FirstInstr,
- SA->getFirstSplitPoint(BC.Number)))
+ !SA->canSplitBeforeProlog(BC.Number))
return false;
}
@@ -830,11 +829,7 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
BCS[B].Number = Number;
// Abort if the spill cannot be inserted at the MBB' start
- MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
- auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
- if (FirstNonDebugInstr != MBB->end() &&
- SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
- SA->getFirstSplitPoint(Number)))
+ if (!SA->canSplitBeforeProlog(Number))
return false;
// Interference for the live-in value.
if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 8ec4bfbb5a330..f27ff674dcf8c 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,6 +147,54 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
return LIS.getInstructionFromIndex(LIP);
}
+bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+
+ for (auto &MI : MBB) {
+ if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
+ MI.isPseudoProbe())
+ continue;
+
+ if (!TII->isBasicBlockPrologue(MI))
+ return true;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
+ continue;
+
+      // For the AMDGPU target, if an MBB contains an exec mask restore
+      // preamble, SplitEditor may get into a state where it cannot insert
+      // a spill instruction at the beginning of the MBB.
+ // E.g. for a MIR
+ // bb.100:
+ // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
+ // implicit $exec
+ // ...
+ // use %1
+      // If the regalloc tries to allocate a virtreg to the physreg already
+      // assigned to virtreg %1 and the physreg is computed as the best
+      // candidate for splitting, it may insert a COPY instruction.
+ // bb.100:
+ // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
+ // implicit $exec
+ // %2 = COPY %orig
+ // ...
+ // use %1
+      // Thus %1 and %orig still interfere. We could add a cost to the
+      // physreg candidate or abandon the candidate.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+ const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
+ if (TRI->getCommonSubClass(RC, CurRC))
+ return false;
+ }
+ }
+
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Split Analysis
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index de255911268f2..a9fc921534d0e 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,6 +89,9 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
return Res;
}
+  /// Return true if we can split \p CurLI before \p MBB's prolog.
+ bool canSplitBeforeProlog(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB);
};
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -247,6 +250,11 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
SlotIndex getFirstSplitPoint(unsigned Num) {
return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
}
+
+ bool canSplitBeforeProlog(unsigned Num) {
+ MachineBasicBlock *BB = MF.getBlockNumbered(Num);
+ return IPA.canSplitBeforeProlog(*CurLI, *BB);
+ }
};
/// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 4c5c56a49fdc6..10f7b701c3122 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -151238,13 +151238,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s73, s21
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v44, s19, 0
; SI-NEXT: v_writelane_b32 v44, s18, 1
; SI-NEXT: v_writelane_b32 v44, s17, 2
; SI-NEXT: v_writelane_b32 v44, s16, 3
-; SI-NEXT: v_writelane_b32 v41, s30, 0
; SI-NEXT: v_writelane_b32 v41, s31, 1
; SI-NEXT: v_writelane_b32 v41, s34, 2
; SI-NEXT: v_writelane_b32 v41, s35, 3
@@ -151268,9 +151268,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s69, 21
; SI-NEXT: v_writelane_b32 v41, s70, 22
; SI-NEXT: v_writelane_b32 v41, s71, 23
-; SI-NEXT: s_mov_b32 s74, s29
-; SI-NEXT: s_mov_b32 s78, s28
-; SI-NEXT: s_mov_b32 s76, s27
+; SI-NEXT: s_mov_b32 s57, s28
+; SI-NEXT: s_mov_b32 s47, s27
; SI-NEXT: v_writelane_b32 v41, s80, 24
; SI-NEXT: v_writelane_b32 v41, s81, 25
; SI-NEXT: v_writelane_b32 v41, s82, 26
@@ -151280,7 +151279,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: s_mov_b32 s47, s26
; SI-NEXT: v_writelane_b32 v41, s97, 33
; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: v_writelane_b32 v41, s99, 35
@@ -151290,95 +151288,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
+; SI-NEXT: v_readfirstlane_b32 s89, v3
+; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s90, v9
+; SI-NEXT: v_writelane_b32 v42, s89, 0
+; SI-NEXT: v_readfirstlane_b32 s91, v10
+; SI-NEXT: v_writelane_b32 v42, s90, 1
+; SI-NEXT: v_readfirstlane_b32 s92, v8
+; SI-NEXT: v_writelane_b32 v42, s91, 2
+; SI-NEXT: v_readfirstlane_b32 s93, v7
+; SI-NEXT: v_writelane_b32 v42, s92, 3
+; SI-NEXT: v_readfirstlane_b32 s94, v13
+; SI-NEXT: v_writelane_b32 v42, s93, 4
+; SI-NEXT: v_readfirstlane_b32 s95, v14
+; SI-NEXT: v_writelane_b32 v42, s94, 5
+; SI-NEXT: v_writelane_b32 v42, s95, 6
+; SI-NEXT: v_readfirstlane_b32 s30, v17
+; SI-NEXT: v_readfirstlane_b32 s31, v18
+; SI-NEXT: v_readfirstlane_b32 s34, v16
+; SI-NEXT: v_readfirstlane_b32 s35, v15
+; SI-NEXT: v_readfirstlane_b32 s36, v21
; SI-NEXT: v_readfirstlane_b32 s37, v22
-; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
; SI-NEXT: v_readfirstlane_b32 s38, v20
-; SI-NEXT: v_writelane_b32 v43, s37, 0
; SI-NEXT: v_readfirstlane_b32 s39, v19
-; SI-NEXT: v_writelane_b32 v43, s38, 1
; SI-NEXT: v_readfirstlane_b32 s48, v25
-; SI-NEXT: v_writelane_b32 v43, s39, 2
; SI-NEXT: v_readfirstlane_b32 s49, v26
-; SI-NEXT: v_writelane_b32 v43, s48, 3
; SI-NEXT: v_readfirstlane_b32 s50, v24
-; SI-NEXT: v_writelane_b32 v43, s49, 4
; SI-NEXT: v_readfirstlane_b32 s51, v23
-; SI-NEXT: v_writelane_b32 v43, s50, 5
; SI-NEXT: v_readfirstlane_b32 s52, v29
-; SI-NEXT: v_writelane_b32 v43, s51, 6
; SI-NEXT: v_readfirstlane_b32 s53, v30
-; SI-NEXT: v_writelane_b32 v43, s52, 7
-; SI-NEXT: v_readfirstlane_b32 s54, v28
-; SI-NEXT: v_writelane_b32 v43, s53, 8
-; SI-NEXT: v_readfirstlane_b32 s55, v27
-; SI-NEXT: v_writelane_b32 v43, s54, 9
-; SI-NEXT: v_writelane_b32 v43, s55, 10
-; SI-NEXT: s_mov_b32 s57, s24
-; SI-NEXT: v_readfirstlane_b32 s16, v1
-; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s6, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280
+; SI-NEXT: v_writelane_b32 v44, s4, 4
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v44, s4, 4
+; SI-NEXT: v_writelane_b32 v44, s4, 5
; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v44, s4, 5
-; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v44, s4, 6
-; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_readfirstlane_b32 s4, v34
; SI-NEXT: v_writelane_b32 v44, s4, 7
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v44, s4, 8
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 8
+; SI-NEXT: v_writelane_b32 v44, s4, 9
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v44, s4, 9
+; SI-NEXT: v_writelane_b32 v44, s4, 10
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 10
+; SI-NEXT: v_writelane_b32 v44, s4, 11
+; SI-NEXT: v_readfirstlane_b32 s54, v28
+; SI-NEXT: v_readfirstlane_b32 s55, v27
+; SI-NEXT: s_mov_b32 s6, s23
+; SI-NEXT: s_mov_b32 s23, s21
+; SI-NEXT: s_mov_b32 s58, s26
+; SI-NEXT: s_mov_b32 s40, s25
+; SI-NEXT: s_mov_b32 s25, s24
+; SI-NEXT: v_readfirstlane_b32 s16, v1
+; SI-NEXT: v_readfirstlane_b32 s17, v2
; SI-NEXT: v_readfirstlane_b32 s18, v5
; SI-NEXT: v_readfirstlane_b32 s19, v6
; SI-NEXT: v_readfirstlane_b32 s77, v4
-; SI-NEXT: v_readfirstlane_b32 s89, v3
-; SI-NEXT: v_readfirstlane_b32 s90, v9
-; SI-NEXT: v_readfirstlane_b32 s91, v10
-; SI-NEXT: v_readfirstlane_b32 s92, v8
-; SI-NEXT: v_readfirstlane_b32 s93, v7
-; SI-NEXT: v_readfirstlane_b32 s94, v13
-; SI-NEXT: v_readfirstlane_b32 s95, v14
-; SI-NEXT: v_readfirstlane_b32 s30, v17
-; SI-NEXT: v_readfirstlane_b32 s31, v18
-; SI-NEXT: v_readfirstlane_b32 s34, v16
-; SI-NEXT: v_readfirstlane_b32 s35, v15
-; SI-NEXT: v_readfirstlane_b32 s36, v21
+; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT: v_readfirstlane_b32 s26, v53
+; SI-NEXT: v_readfirstlane_b32 s46, v54
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_readfirstlane_b32 s61, v55
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s24, v40
+; SI-NEXT: v_readfirstlane_b32 s62, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 11
+; SI-NEXT: v_writelane_b32 v44, s4, 12
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 12
+; SI-NEXT: v_writelane_b32 v44, s4, 13
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 13
+; SI-NEXT: v_writelane_b32 v44, s4, 14
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 14
+; SI-NEXT: v_writelane_b32 v44, s4, 15
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 15
+; SI-NEXT: v_writelane_b32 v44, s4, 16
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -151388,40 +151392,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
+; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s75, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_writelane_b32 v44, s4, 18
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s21, v33
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v44, s4, 16
+; SI-NEXT: v_writelane_b32 v44, s4, 19
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: v_writelane_b32 v44, s4, 20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s40, v35
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v44, s4, 21
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s61, v36
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: v_writelane_b32 v44, s4, 22
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s63, v37
+; SI-NEXT: v_readfirstlane_b32 s4, v37
+; SI-NEXT: v_writelane_b32 v44, s4, 23
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v44, s4, 17
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s59, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_writelane_b32 v44, s4, 24
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s56, v38
+; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: v_writelane_b32 v44, s4, 25
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s43, v39
+; SI-NEXT: v_readfirstlane_b32 s4, v39
+; SI-NEXT: v_writelane_b32 v44, s4, 26
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s46, v48
+; SI-NEXT: v_readfirstlane_b32 s4, v48
+; SI-NEXT: v_writelane_b32 v44, s4, 27
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s42, v49
+; SI-NEXT: v_readfirstlane_b32 s4, v49
+; SI-NEXT: v_writelane_b32 v44, s4, 28
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s13, v50
+; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: v_writelane_b32 v44, s4, 29
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s45, v51
+; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -151429,45 +151444,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
+; SI-NEXT: v_writelane_b32 v44, s4, 30
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s88, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_writelane_b32 v44, s4, 31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s79, v33
+; SI-NEXT: v_readfirstlane_b32 s4, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
+; SI-NEXT: v_writelane_b32 v44, s4, 32
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v44, s4, 18
+; SI-NEXT: v_writelane_b32 v44, s4, 33
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v44, s4, 19
+; SI-NEXT: v_writelane_b32 v44, s4, 34
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v44, s4, 20
+; SI-NEXT: v_writelane_b32 v44, s4, 35
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_writelane_b32 v44, s4, 21
+; SI-NEXT: v_readfirstlane_b32 s43, v37
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v44, s4, 22
+; SI-NEXT: v_writelane_b32 v44, s4, 36
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v44, s4, 23
+; SI-NEXT: v_writelane_b32 v44, s4, 37
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v44, s4, 24
+; SI-NEXT: v_writelane_b32 v44, s4, 38
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v44, s4, 25
+; SI-NEXT: v_writelane_b32 v44, s4, 39
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v44, s4, 26
+; SI-NEXT: v_writelane_b32 v44, s4, 40
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v44, s4, 27
+; SI-NEXT: v_writelane_b32 v44, s4, 41
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s4, v51
-; SI-NEXT: v_writelane_b32 v44, s4, 28
+; SI-NEXT: v_writelane_b32 v44, s4, 42
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(3)
@@ -151483,41 +151500,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT: v_writelane_b32 v44, s4, 29
+; SI-NEXT: v_writelane_b32 v44, s4, 43
+; SI-NEXT: v_writelane_b32 v44, s22, 44
+; SI-NEXT: v_writelane_b32 v44, s6, 45
+; SI-NEXT: v_writelane_b32 v44, s23, 46
+; SI-NEXT: v_writelane_b32 v44, s20, 47
+; SI-NEXT: v_writelane_b32 v44, s58, 48
+; SI-NEXT: v_writelane_b32 v44, s47, 49
+; SI-NEXT: v_writelane_b32 v44, s40, 50
+; SI-NEXT: v_writelane_b32 v44, s25, 51
+; SI-NEXT: v_writelane_b32 v44, s29, 52
+; SI-NEXT: v_writelane_b32 v44, s57, 53
+; SI-NEXT: v_writelane_b32 v44, s62, 54
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s4, v52
-; SI-NEXT: v_writelane_b32 v44, s4, 30
-; SI-NEXT: v_readfirstlane_b32 s4, v53
-; SI-NEXT: v_writelane_b32 v44, s4, 31
-; SI-NEXT: v_readfirstlane_b32 s4, v54
-; SI-NEXT: v_writelane_b32 v44, s4, 32
-; SI-NEXT: v_readfirstlane_b32 s4, v55
-; SI-NEXT: v_writelane_b32 v44, s4, 33
-; SI-NEXT: v_writelane_b32 v44, s22, 34
-; SI-NEXT: v_writelane_b32 v44, s23, 35
-; SI-NEXT: v_writelane_b32 v44, s73, 36
-; SI-NEXT: v_writelane_b32 v44, s20, 37
-; SI-NEXT: v_writelane_b32 v44, s47, 38
-; SI-NEXT: v_writelane_b32 v44, s76, 39
-; SI-NEXT: v_writelane_b32 v44, s25, 40
-; SI-NEXT: v_writelane_b32 v44, s57, 41
-; SI-NEXT: v_writelane_b32 v44, s74, 42
-; SI-NEXT: v_writelane_b32 v44, s78, 43
-; SI-NEXT: v_writelane_b32 v44, s24, 44
-; SI-NEXT: v_writelane_b32 v44, s16, 45
-; SI-NEXT: v_writelane_b32 v44, s17, 46
-; SI-NEXT: v_writelane_b32 v44, s18, 47
-; SI-NEXT: v_writelane_b32 v44, s19, 48
-; SI-NEXT: v_writelane_b32 v44, s77, 49
-; SI-NEXT: v_writelane_b32 v44, s89, 50
-; SI-NEXT: v_writelane_b32 v44, s90, 51
-; SI-NEXT: v_writelane_b32 v44, s91, 52
-; SI-NEXT: v_writelane_b32 v44, s92, 53
-; SI-NEXT: v_writelane_b32 v44, s93, 54
-; SI-NEXT: v_writelane_b32 v44, s94, 55
-; SI-NEXT: v_writelane_b32 v44, s95, 56
+; SI-NEXT: v_readfirstlane_b32 s21, v52
+; SI-NEXT: v_writelane_b32 v44, s61, 55
+; SI-NEXT: v_writelane_b32 v44, s21, 56
+; SI-NEXT: v_writelane_b32 v44, s26, 57
+; SI-NEXT: v_writelane_b32 v44, s46, 58
+; SI-NEXT: v_writelane_b32 v44, s16, 59
+; SI-NEXT: v_writelane_b32 v44, s17, 60
+; SI-NEXT: v_writelane_b32 v44, s18, 61
+; SI-NEXT: v_writelane_b32 v44, s19, 62
+; SI-NEXT: v_writelane_b32 v44, s77, 63
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s58, v33
+; SI-NEXT: v_readfirstlane_b32 s13, v33
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s10, v34
; SI-NEXT: s_waitcnt vmcnt(8)
@@ -151525,7 +151532,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s28, v31
; SI-NEXT: v_readfirstlane_b32 s27, v32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s29, v36
+; SI-NEXT: v_readfirstlane_b32 s56, v36
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s69, v37
; SI-NEXT: s_waitcnt vmcnt(5)
@@ -151556,17 +151563,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12
; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11
-; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57
-; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58
-; SI-NEXT: v_writelane_b32 v44, s30, 59
-; SI-NEXT: v_writelane_b32 v44, s31, 60
-; SI-NEXT: v_writelane_b32 v44, s34, 61
-; SI-NEXT: v_writelane_b32 v44, s35, 62
-; SI-NEXT: v_writelane_b32 v44, s36, 63
+; SI-NEXT: v_writelane_b32 v42, vcc_lo, 7
+; SI-NEXT: v_writelane_b32 v42, vcc_hi, 8
+; SI-NEXT: v_writelane_b32 v42, s30, 9
+; SI-NEXT: v_writelane_b32 v42, s31, 10
+; SI-NEXT: v_writelane_b32 v42, s34, 11
+; SI-NEXT: v_writelane_b32 v42, s35, 12
+; SI-NEXT: v_writelane_b32 v42, s36, 13
+; SI-NEXT: v_writelane_b32 v42, s37, 14
+; SI-NEXT: v_writelane_b32 v42, s38, 15
+; SI-NEXT: v_writelane_b32 v42, s39, 16
+; SI-NEXT: v_writelane_b32 v42, s48, 17
+; SI-NEXT: v_writelane_b32 v42, s49, 18
+; SI-NEXT: v_writelane_b32 v42, s50, 19
+; SI-NEXT: v_writelane_b32 v42, s51, 20
+; SI-NEXT: v_writelane_b32 v42, s52, 21
+; SI-NEXT: v_writelane_b32 v42, s53, 22
+; SI-NEXT: v_writelane_b32 v42, s54, 23
+; SI-NEXT: v_writelane_b32 v42, s55, 24
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s60, v31
+; SI-NEXT: v_readfirstlane_b32 s59, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s62, v32
+; SI-NEXT: v_readfirstlane_b32 s63, v32
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s83, v33
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -151590,7 +151608,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s26, v48
+; SI-NEXT: v_readfirstlane_b32 s75, v48
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s15, v49
; SI-NEXT: s_waitcnt vmcnt(9)
@@ -151624,48 +151642,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_readfirstlane_b32 s65, v48
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_readfirstlane_b32 s64, v49
-; SI-NEXT: v_writelane_b32 v43, s64, 11
+; SI-NEXT: v_writelane_b32 v42, s64, 25
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_readfirstlane_b32 s67, v50
-; SI-NEXT: v_writelane_b32 v43, s65, 12
+; SI-NEXT: v_writelane_b32 v42, s65, 26
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s84, v51
-; SI-NEXT: v_writelane_b32 v43, s67, 13
-; SI-NEXT: v_writelane_b32 v43, s84, 14
-; SI-NEXT: v_writelane_b32 v43, s85, 15
-; SI-NEXT: v_writelane_b32 v43, s86, 16
-; SI-NEXT: v_writelane_b32 v43, s87, 17
-; SI-NEXT: v_writelane_b32 v43, s8, 18
-; SI-NEXT: v_writelane_b32 v43, s99, 19
-; SI-NEXT: v_writelane_b32 v43, s12, 20
-; SI-NEXT: v_writelane_b32 v43, s44, 21
-; SI-NEXT: v_writelane_b32 v43, s97, 22
-; SI-NEXT: v_writelane_b32 v43, s15, 23
-; SI-NEXT: v_writelane_b32 v43, s96, 24
-; SI-NEXT: v_writelane_b32 v43, s98, 25
-; SI-NEXT: v_writelane_b32 v43, s83, 26
-; SI-NEXT: v_writelane_b32 v43, s82, 27
-; SI-NEXT: v_writelane_b32 v43, s9, 28
-; SI-NEXT: v_writelane_b32 v43, s81, 29
-; SI-NEXT: v_writelane_b32 v43, s80, 30
-; SI-NEXT: v_writelane_b32 v43, s7, 31
-; SI-NEXT: v_writelane_b32 v43, s72, 32
-; SI-NEXT: v_writelane_b32 v43, s26, 33
-; SI-NEXT: v_writelane_b32 v43, s41, 34
-; SI-NEXT: v_writelane_b32 v43, s14, 35
-; SI-NEXT: v_writelane_b32 v43, s69, 36
-; SI-NEXT: v_writelane_b32 v43, s71, 37
-; SI-NEXT: v_writelane_b32 v43, s70, 38
-; SI-NEXT: v_writelane_b32 v43, s68, 39
-; SI-NEXT: v_writelane_b32 v43, s60, 40
-; SI-NEXT: v_writelane_b32 v43, s62, 41
-; SI-NEXT: v_writelane_b32 v43, s11, 42
-; SI-NEXT: v_writelane_b32 v43, s10, 43
-; SI-NEXT: v_writelane_b32 v43, s58, 44
-; SI-NEXT: v_writelane_b32 v43, s66, 45
-; SI-NEXT: v_writelane_b32 v43, s29, 46
-; SI-NEXT: v_writelane_b32 v43, s28, 47
-; SI-NEXT: v_writelane_b32 v43, s27, 48
+; SI-NEXT: v_writelane_b32 v42, s67, 27
+; SI-NEXT: v_writelane_b32 v42, s84, 28
+; SI-NEXT: v_writelane_b32 v42, s85, 29
+; SI-NEXT: v_writelane_b32 v42, s86, 30
+; SI-NEXT: v_writelane_b32 v42, s87, 31
+; SI-NEXT: v_writelane_b32 v42, s8, 32
+; SI-NEXT: v_writelane_b32 v42, s99, 33
+; SI-NEXT: v_writelane_b32 v42, s12, 34
+; SI-NEXT: v_writelane_b32 v42, s44, 35
+; SI-NEXT: v_writelane_b32 v42, s97, 36
+; SI-NEXT: v_writelane_b32 v42, s15, 37
+; SI-NEXT: v_writelane_b32 v42, s96, 38
+; SI-NEXT: v_writelane_b32 v42, s98, 39
+; SI-NEXT: v_writelane_b32 v42, s83, 40
+; SI-NEXT: v_writelane_b32 v42, s82, 41
+; SI-NEXT: v_writelane_b32 v42, s9, 42
+; SI-NEXT: v_writelane_b32 v42, s81, 43
+; SI-NEXT: v_writelane_b32 v42, s80, 44
+; SI-NEXT: v_writelane_b32 v42, s7, 45
+; SI-NEXT: v_writelane_b32 v42, s72, 46
+; SI-NEXT: v_writelane_b32 v42, s75, 47
+; SI-NEXT: v_writelane_b32 v42, s41, 48
+; SI-NEXT: v_writelane_b32 v42, s14, 49
+; SI-NEXT: v_writelane_b32 v42, s69, 50
+; SI-NEXT: v_writelane_b32 v42, s71, 51
+; SI-NEXT: v_writelane_b32 v42, s70, 52
+; SI-NEXT: v_writelane_b32 v42, s68, 53
+; SI-NEXT: v_writelane_b32 v42, s59, 54
+; SI-NEXT: v_writelane_b32 v42, s63, 55
+; SI-NEXT: v_writelane_b32 v42, s11, 56
+; SI-NEXT: v_writelane_b32 v42, s10, 57
+; SI-NEXT: v_writelane_b32 v42, s13, 58
+; SI-NEXT: v_writelane_b32 v42, s66, 59
+; SI-NEXT: v_writelane_b32 v42, s56, 60
+; SI-NEXT: v_writelane_b32 v42, s28, 61
+; SI-NEXT: v_writelane_b32 v42, s27, 62
; SI-NEXT: s_cbranch_scc0 .LBB89_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_readlane_b32 s4, v44, 3
@@ -151674,107 +151692,97 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v43, s4, 58
+; SI-NEXT: v_writelane_b32 v43, s4, 10
; SI-NEXT: v_readlane_b32 s4, v44, 1
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: v_readlane_b32 s5, v44, 0
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s5, s5, 24
-; SI-NEXT: s_or_b32 s4, s5, s4
-; SI-NEXT: v_writelane_b32 v43, s4, 59
+; SI-NEXT: s_or_b32 s45, s5, s4
; SI-NEXT: s_and_b32 s4, s20, 0xff
-; SI-NEXT: s_lshl_b32 s5, s73, 8
-; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_lshl_b32 s5, s23, 8
+; SI-NEXT: s_or_b32 s24, s4, s5
; SI-NEXT: s_and_b32 s5, s22, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_mov_b32 s22, s6
-; SI-NEXT: s_lshl_b32 s6, s23, 24
-; SI-NEXT: v_writelane_b32 v43, s4, 60
-; SI-NEXT: s_or_b32 s4, s6, s5
-; SI-NEXT: s_and_b32 s5, s57, 0xff
+; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: s_or_b32 s42, s6, s5
+; SI-NEXT: s_and_b32 s5, s25, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s25, 24
-; SI-NEXT: v_writelane_b32 v43, s4, 61
-; SI-NEXT: s_or_b32 s4, s6, s5
-; SI-NEXT: s_and_b32 s5, s47, 0xff
+; SI-NEXT: s_lshl_b32 s6, s40, 24
+; SI-NEXT: s_or_b32 s40, s6, s5
+; SI-NEXT: s_and_b32 s5, s58, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
-; SI-NEXT: s_lshl_b32 s6, s76, 24
-; SI-NEXT: v_writelane_b32 v43, s4, 62
-; SI-NEXT: s_or_b32 s4, s6, s5
-; SI-NEXT: s_and_b32 s5, s78, 0xff
-; SI-NEXT: s_lshl_b32 s6, s74, 8
+; SI-NEXT: s_lshl_b32 s6, s47, 24
+; SI-NEXT: s_or_b32 s73, s6, s5
+; SI-NEXT: s_and_b32 s5, s57, 0xff
+; SI-NEXT: s_lshl_b32 s6, s29, 8
; SI-NEXT: s_or_b32 s5, s5, s6
; SI-NEXT: s_and_b32 s6, s16, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s17, 24
-; SI-NEXT: v_writelane_b32 v43, s4, 63
-; SI-NEXT: s_or_b32 s4, s16, s6
+; SI-NEXT: s_or_b32 s25, s16, s6
; SI-NEXT: s_and_b32 s6, s89, 0xff
-; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s77, 24
-; SI-NEXT: v_writelane_b32 v42, s4, 0
-; SI-NEXT: s_or_b32 s6, s16, s6
-; SI-NEXT: v_writelane_b32 v42, s6, 1
+; SI-NEXT: s_or_b32 s4, s16, s6
; SI-NEXT: s_and_b32 s6, s18, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s16, s19, 24
-; SI-NEXT: s_or_b32 s76, s16, s6
+; SI-NEXT: s_or_b32 s23, s16, s6
; SI-NEXT: s_and_b32 s6, s93, 0xff
; SI-NEXT: s_lshl_b32 s16, s92, 8
; SI-NEXT: s_or_b32 s6, s6, s16
; SI-NEXT: s_and_b32 s16, s90, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, s91, 24
-; SI-NEXT: s_or_b32 s77, s17, s16
+; SI-NEXT: s_or_b32 s76, s17, s16
; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24
-; SI-NEXT: s_or_b32 s25, s17, s16
+; SI-NEXT: s_or_b32 s22, s17, s16
; SI-NEXT: s_and_b32 s16, s94, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s17, s95, 24
-; SI-NEXT: s_or_b32 s74, s17, s16
+; SI-NEXT: s_or_b32 s88, s17, s16
; SI-NEXT: s_and_b32 s16, s35, 0xff
; SI-NEXT: s_lshl_b32 s17, s34, 8
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s17, s30, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_lshl_b32 s18, s31, 24
-; SI-NEXT: s_or_b32 s78, s18, s17
+; SI-NEXT: s_or_b32 s74, s18, s17
; SI-NEXT: s_and_b32 s17, s39, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_lshl_b32 s18, s38, 24
-; SI-NEXT: s_mov_b32 s31, s88
-; SI-NEXT: s_or_b32 s88, s18, s17
+; SI-NEXT: s_or_b32 s77, s18, s17
; SI-NEXT: s_and_b32 s17, s36, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: s_lshl_b32 s18, s37, 24
-; SI-NEXT: s_or_b32 s89, s18, s17
+; SI-NEXT: s_or_b32 s79, s18, s17
; SI-NEXT: s_and_b32 s17, s51, 0xff
; SI-NEXT: s_lshl_b32 s18, s50, 8
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s18, s48, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s49, 24
-; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: v_writelane_b32 v43, s18, 49
+; SI-NEXT: s_or_b32 s89, s19, s18
; SI-NEXT: s_and_b32 s18, s55, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s54, 24
-; SI-NEXT: s_mov_b32 s73, s79
-; SI-NEXT: s_or_b32 s79, s19, s18
+; SI-NEXT: s_or_b32 s78, s19, s18
; SI-NEXT: s_and_b32 s18, s52, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_lshl_b32 s19, s53, 24
-; SI-NEXT: s_or_b32 s94, s19, s18
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: v_writelane_b32 v42, s18, 63
; SI-NEXT: s_and_b32 s18, s84, 0xff
; SI-NEXT: s_lshl_b32 s19, s67, 8
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_and_b32 s19, s64, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s65, 24
-; SI-NEXT: s_or_b32 s95, s20, s19
+; SI-NEXT: s_or_b32 s19, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s19, 0
; SI-NEXT: s_and_b32 s19, s12, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s8, 24
@@ -151782,49 +151790,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s19, s85, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s86, 24
-; SI-NEXT: s_or_b32 s12, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s8, 2
+; SI-NEXT: s_or_b32 s8, s20, s19
; SI-NEXT: s_and_b32 s19, s80, 0xff
; SI-NEXT: s_lshl_b32 s20, s9, 8
; SI-NEXT: s_or_b32 vcc_lo, s19, s20
; SI-NEXT: s_and_b32 s19, s44, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s97, 24
-; SI-NEXT: s_or_b32 s9, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s8, 1
+; SI-NEXT: s_or_b32 s8, s20, s19
; SI-NEXT: s_and_b32 s19, s41, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s7, 24
; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s96, 0xff
+; SI-NEXT: v_writelane_b32 v43, s8, 3
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s15, 24
-; SI-NEXT: v_writelane_b32 v43, s12, 50
-; SI-NEXT: s_or_b32 s12, s20, s19
-; SI-NEXT: s_and_b32 s19, s26, 0xff
+; SI-NEXT: v_writelane_b32 v43, s7, 5
+; SI-NEXT: s_or_b32 s7, s20, s19
+; SI-NEXT: s_and_b32 s19, s75, 0xff
; SI-NEXT: s_lshl_b32 s20, s82, 8
; SI-NEXT: s_or_b32 vcc_hi, s19, s20
; SI-NEXT: s_and_b32 s19, s99, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s87, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 51
-; SI-NEXT: s_or_b32 s9, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s7, 4
+; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s72, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s81, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 52
-; SI-NEXT: s_or_b32 s9, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s7, 6
+; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s98, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s83, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 54
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s62, 0xff
-; SI-NEXT: s_lshl_b32 s20, s60, 8
+; SI-NEXT: s_or_b32 s60, s20, s19
+; SI-NEXT: s_and_b32 s19, s63, 0xff
+; SI-NEXT: s_lshl_b32 s20, s59, 8
; SI-NEXT: s_or_b32 s84, s19, s20
; SI-NEXT: s_and_b32 s19, s71, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s70, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 53
-; SI-NEXT: s_or_b32 s9, s20, s19
+; SI-NEXT: v_writelane_b32 v43, s7, 7
+; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s11, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s68, 24
@@ -151832,185 +151842,185 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s19, s14, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s69, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 55
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s29, 0xff
+; SI-NEXT: s_or_b32 s58, s20, s19
+; SI-NEXT: s_and_b32 s19, s56, 0xff
; SI-NEXT: s_lshl_b32 s20, s66, 8
; SI-NEXT: s_or_b32 s85, s19, s20
; SI-NEXT: s_and_b32 s19, s10, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s58, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 56
-; SI-NEXT: s_or_b32 s9, s20, s19
+; SI-NEXT: s_lshl_b32 s20, s13, 24
+; SI-NEXT: v_writelane_b32 v43, s7, 8
+; SI-NEXT: s_or_b32 s7, s20, s19
; SI-NEXT: s_and_b32 s19, s27, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_lshl_b32 s20, s28, 24
-; SI-NEXT: v_writelane_b32 v43, s9, 57
-; SI-NEXT: s_or_b32 s23, s20, s19
-; SI-NEXT: s_and_b32 s19, s24, 0xff
-; SI-NEXT: v_readlane_b32 s9, v44, 33
+; SI-NEXT: s_or_b32 s47, s20, s19
+; SI-NEXT: s_and_b32 s19, s62, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v44, 32
-; SI-NEXT: s_or_b32 s10, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v44, 31
-; SI-NEXT: s_lshl_b32 s20, s9, 8
-; SI-NEXT: v_readlane_b32 s9, v44, 30
+; SI-NEXT: s_lshl_b32 s20, s61, 24
+; SI-NEXT: s_or_b32 s56, s20, s19
+; SI-NEXT: s_and_b32 s19, s46, 0xff
+; SI-NEXT: s_lshl_b32 s20, s26, 8
+; SI-NEXT: v_writelane_b32 v43, s7, 9
; SI-NEXT: s_or_b32 s86, s19, s20
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v44, 29
+; SI-NEXT: s_and_b32 s19, s21, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 43
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s9, v44, 28
-; SI-NEXT: s_or_b32 s47, s20, s19
-; SI-NEXT: s_and_b32 s19, s9, 0xff
-; SI-NEXT: v_readlane_b32 s9, v44, 27
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 42
+; SI-NEXT: s_or_b32 s61, s20, s19
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 41
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s9, 24
-; SI-NEXT: v_readlane_b32 s11, v44, 26
-; SI-NEXT: s_or_b32 s9, s20, s19
-; SI-NEXT: s_and_b32 s19, s11, 0xff
-; SI-NEXT: v_readlane_b32 s11, v44, 25
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 40
+; SI-NEXT: s_or_b32 s46, s20, s19
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 39
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s11, 24
-; SI-NEXT: v_readlane_b32 s11, v44, 24
-; SI-NEXT: s_or_b32 s24, s20, s19
-; SI-NEXT: s_mov_b32 s92, s11
-; SI-NEXT: s_and_b32 s19, s11, 0xff
-; SI-NEXT: v_readlane_b32 s11, v44, 23
-; SI-NEXT: s_mov_b32 s36, s11
-; SI-NEXT: s_lshl_b32 s20, s11, 8
-; SI-NEXT: v_readlane_b32 s11, v44, 22
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 38
+; SI-NEXT: s_or_b32 s62, s20, s19
+; SI-NEXT: s_mov_b32 s92, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 37
+; SI-NEXT: s_mov_b32 s37, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 36
; SI-NEXT: s_or_b32 s87, s19, s20
-; SI-NEXT: s_mov_b32 s62, s11
-; SI-NEXT: s_and_b32 s19, s11, 0xff
-; SI-NEXT: v_readlane_b32 s11, v44, 21
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: s_mov_b32 s30, s7
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_mov_b32 s30, s11
-; SI-NEXT: s_lshl_b32 s20, s11, 24
-; SI-NEXT: v_readlane_b32 s11, v44, 20
-; SI-NEXT: s_or_b32 s58, s20, s19
-; SI-NEXT: s_mov_b32 s91, s11
-; SI-NEXT: s_and_b32 s19, s11, 0xff
-; SI-NEXT: v_readlane_b32 s11, v44, 19
+; SI-NEXT: s_lshl_b32 s20, s43, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 35
+; SI-NEXT: s_or_b32 s64, s20, s19
+; SI-NEXT: s_mov_b32 s91, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 34
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_mov_b32 s35, s11
-; SI-NEXT: s_lshl_b32 s20, s11, 24
-; SI-NEXT: v_readlane_b32 s11, v44, 18
-; SI-NEXT: s_mov_b32 s4, s46
-; SI-NEXT: s_or_b32 s46, s20, s19
-; SI-NEXT: s_and_b32 s19, s11, 0xff
+; SI-NEXT: s_mov_b32 s36, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 33
+; SI-NEXT: s_mov_b32 s39, s43
+; SI-NEXT: s_or_b32 s43, s20, s19
+; SI-NEXT: s_mov_b32 s53, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 32
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s73, 24
-; SI-NEXT: s_mov_b32 s52, s73
-; SI-NEXT: s_or_b32 s73, s20, s19
-; SI-NEXT: s_and_b32 s19, s31, 0xff
-; SI-NEXT: s_lshl_b32 s20, s45, 8
+; SI-NEXT: s_mov_b32 s49, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 31
+; SI-NEXT: s_or_b32 s65, s20, s19
+; SI-NEXT: s_mov_b32 s90, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 30
+; SI-NEXT: s_mov_b32 s54, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 29
; SI-NEXT: s_or_b32 s26, s19, s20
-; SI-NEXT: s_and_b32 s19, s13, 0xff
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 28
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s42, 24
+; SI-NEXT: s_mov_b32 s50, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 27
; SI-NEXT: s_or_b32 s67, s20, s19
-; SI-NEXT: s_and_b32 s19, s4, 0xff
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 26
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s43, 24
-; SI-NEXT: s_mov_b32 s53, s42
-; SI-NEXT: s_or_b32 s42, s20, s19
-; SI-NEXT: s_and_b32 s19, s56, 0xff
+; SI-NEXT: s_mov_b32 s38, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 25
+; SI-NEXT: s_or_b32 s66, s20, s19
+; SI-NEXT: s_mov_b32 s48, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 24
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s59, 24
+; SI-NEXT: s_mov_b32 s59, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 23
; SI-NEXT: s_or_b32 s68, s20, s19
-; SI-NEXT: s_and_b32 s19, s63, 0xff
-; SI-NEXT: s_lshl_b32 s20, s61, 8
-; SI-NEXT: v_readlane_b32 s93, v44, 17
+; SI-NEXT: s_mov_b32 s63, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 22
+; SI-NEXT: s_mov_b32 s52, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 21
; SI-NEXT: s_or_b32 s27, s19, s20
-; SI-NEXT: s_and_b32 s19, s40, 0xff
+; SI-NEXT: s_mov_b32 s51, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 20
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s93, 24
+; SI-NEXT: s_mov_b32 s55, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 19
; SI-NEXT: s_or_b32 s70, s20, s19
-; SI-NEXT: s_and_b32 s19, s21, 0xff
-; SI-NEXT: s_mov_b32 s51, s59
-; SI-NEXT: s_mov_b32 s59, s7
+; SI-NEXT: s_mov_b32 s93, s7
+; SI-NEXT: s_and_b32 s19, s7, 0xff
+; SI-NEXT: v_readlane_b32 s7, v44, 18
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s75, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 16
-; SI-NEXT: s_mov_b32 s48, s56
-; SI-NEXT: s_mov_b32 s56, s10
+; SI-NEXT: s_mov_b32 s75, s7
+; SI-NEXT: s_lshl_b32 s20, s7, 24
+; SI-NEXT: v_readlane_b32 s7, v44, 17
; SI-NEXT: s_or_b32 s69, s20, s19
; SI-NEXT: s_mov_b32 s10, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 15
+; SI-NEXT: v_readlane_b32 s7, v44, 16
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s71, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 14
-; SI-NEXT: s_mov_b32 s39, s75
-; SI-NEXT: s_mov_b32 s75, s94
+; SI-NEXT: v_readlane_b32 s7, v44, 15
; SI-NEXT: s_or_b32 s94, s20, s19
; SI-NEXT: s_mov_b32 s41, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 13
+; SI-NEXT: v_readlane_b32 s7, v44, 14
; SI-NEXT: s_mov_b32 s14, s7
; SI-NEXT: s_lshl_b32 s20, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v44, 12
+; SI-NEXT: v_readlane_b32 s7, v44, 13
; SI-NEXT: s_or_b32 s29, s19, s20
; SI-NEXT: s_mov_b32 s81, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 11
-; SI-NEXT: s_mov_b32 s55, s45
-; SI-NEXT: s_mov_b32 s45, s9
+; SI-NEXT: v_readlane_b32 s7, v44, 12
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 10
-; SI-NEXT: s_mov_b32 s38, s11
+; SI-NEXT: v_readlane_b32 s7, v44, 11
; SI-NEXT: s_or_b32 s11, s20, s19
; SI-NEXT: s_mov_b32 s72, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 9
+; SI-NEXT: v_readlane_b32 s7, v44, 10
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s82, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 8
+; SI-NEXT: v_readlane_b32 s7, v44, 9
; SI-NEXT: s_or_b32 s80, s20, s19
; SI-NEXT: s_mov_b32 s83, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 7
+; SI-NEXT: v_readlane_b32 s7, v44, 8
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_mov_b32 s96, s7
; SI-NEXT: s_lshl_b32 s20, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 6
-; SI-NEXT: s_mov_b32 s90, s31
+; SI-NEXT: v_readlane_b32 s7, v44, 7
; SI-NEXT: s_or_b32 s31, s20, s19
; SI-NEXT: s_mov_b32 s98, s7
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v44, 5
+; SI-NEXT: v_readlane_b32 s7, v44, 6
; SI-NEXT: s_mov_b32 s44, s7
; SI-NEXT: s_lshl_b32 s20, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v44, 4
-; SI-NEXT: s_mov_b32 s37, s43
-; SI-NEXT: s_mov_b32 s43, s93
-; SI-NEXT: s_mov_b32 s93, s21
+; SI-NEXT: v_readlane_b32 s7, v44, 5
; SI-NEXT: s_or_b32 s21, s19, s20
; SI-NEXT: s_and_b32 s19, s7, 0xff
-; SI-NEXT: s_mov_b32 s34, s4
+; SI-NEXT: v_readlane_b32 s8, v44, 4
; SI-NEXT: s_lshl_b32 s19, s19, 16
-; SI-NEXT: s_lshl_b32 s20, s22, 24
-; SI-NEXT: v_readlane_b32 s4, v43, 60
-; SI-NEXT: s_mov_b32 s54, s13
-; SI-NEXT: s_mov_b32 s13, s12
-; SI-NEXT: s_mov_b32 s50, s63
-; SI-NEXT: s_mov_b32 s63, s95
-; SI-NEXT: s_mov_b32 s49, s61
-; SI-NEXT: s_mov_b32 s61, s8
-; SI-NEXT: s_mov_b32 s60, s40
+; SI-NEXT: s_lshl_b32 s20, s8, 24
; SI-NEXT: s_mov_b32 s12, s7
-; SI-NEXT: s_mov_b32 s7, s22
+; SI-NEXT: s_mov_b32 s7, s8
; SI-NEXT: s_or_b32 s15, s20, s19
-; SI-NEXT: s_lshl_b32 s20, s4, 16
-; SI-NEXT: s_lshl_b32 s95, s5, 16
-; SI-NEXT: s_lshl_b32 s22, s6, 16
+; SI-NEXT: s_lshl_b32 s20, s24, 16
+; SI-NEXT: s_lshl_b32 s35, s5, 16
+; SI-NEXT: s_lshl_b32 s95, s6, 16
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_lshl_b32 s19, s17, 16
; SI-NEXT: s_lshl_b32 s18, s18, 16
@@ -152021,16 +152031,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s97, s86, 16
; SI-NEXT: s_lshl_b32 s28, s87, 16
; SI-NEXT: s_lshl_b32 s87, s26, 16
-; SI-NEXT: v_readlane_b32 s26, v43, 58
+; SI-NEXT: v_readlane_b32 s26, v43, 10
; SI-NEXT: s_lshl_b32 s86, s27, 16
-; SI-NEXT: v_readlane_b32 s27, v43, 59
-; SI-NEXT: v_readlane_b32 s66, v43, 63
+; SI-NEXT: s_mov_b32 s27, s45
; SI-NEXT: s_lshl_b32 s85, s29, 16
-; SI-NEXT: v_readlane_b32 s29, v43, 62
-; SI-NEXT: v_readlane_b32 s65, v43, 61
-; SI-NEXT: v_readlane_b32 s64, v42, 0
+; SI-NEXT: s_mov_b32 s29, s40
+; SI-NEXT: s_mov_b32 s24, s42
; SI-NEXT: s_lshl_b32 s84, s21, 16
-; SI-NEXT: v_readlane_b32 s21, v42, 1
+; SI-NEXT: s_mov_b32 s21, s4
; SI-NEXT: s_cbranch_execnz .LBB89_3
; SI-NEXT: .LBB89_2: ; %cmp.true
; SI-NEXT: s_add_i32 s4, s98, 3
@@ -152071,7 +152079,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s6, s16, s6
; SI-NEXT: s_add_i32 s16, s93, 3
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: s_lshl_b32 s17, s39, 8
+; SI-NEXT: s_lshl_b32 s17, s75, 8
; SI-NEXT: s_add_i32 s18, s10, 3
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_and_b32 s18, s18, 0xff
@@ -152081,13 +152089,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: s_add_i32 s17, s50, 3
+; SI-NEXT: s_add_i32 s17, s63, 3
; SI-NEXT: s_and_b32 s17, s17, 0xff
-; SI-NEXT: s_lshl_b32 s18, s49, 8
-; SI-NEXT: s_add_i32 s19, s60, 3
+; SI-NEXT: s_lshl_b32 s18, s52, 8
+; SI-NEXT: s_add_i32 s19, s51, 3
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s18, s43, 24
+; SI-NEXT: s_lshl_b32 s18, s55, 24
; SI-NEXT: s_lshl_b32 s19, s19, 16
; SI-NEXT: s_addk_i32 s17, 0x300
; SI-NEXT: s_or_b32 s18, s18, s19
@@ -152095,11 +152103,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: s_add_i32 s18, s34, 3
; SI-NEXT: s_and_b32 s18, s18, 0xff
-; SI-NEXT: s_lshl_b32 s19, s37, 8
+; SI-NEXT: s_lshl_b32 s19, s38, 8
; SI-NEXT: s_add_i32 s20, s48, 3
; SI-NEXT: s_or_b32 s18, s19, s18
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s19, s51, 24
+; SI-NEXT: s_lshl_b32 s19, s59, 24
; SI-NEXT: s_lshl_b32 s20, s20, 16
; SI-NEXT: s_addk_i32 s18, 0x300
; SI-NEXT: s_or_b32 s19, s19, s20
@@ -152107,11 +152115,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s18, s19, s18
; SI-NEXT: s_add_i32 s19, s90, 3
; SI-NEXT: s_and_b32 s19, s19, 0xff
-; SI-NEXT: s_lshl_b32 s20, s55, 8
-; SI-NEXT: s_add_i32 s22, s54, 3
+; SI-NEXT: s_lshl_b32 s20, s54, 8
+; SI-NEXT: s_add_i32 s22, s13, 3
; SI-NEXT: s_or_b32 s19, s20, s19
; SI-NEXT: s_and_b32 s22, s22, 0xff
-; SI-NEXT: s_lshl_b32 s20, s53, 24
+; SI-NEXT: s_lshl_b32 s20, s50, 24
; SI-NEXT: s_lshl_b32 s22, s22, 16
; SI-NEXT: s_addk_i32 s19, 0x300
; SI-NEXT: s_or_b32 s20, s20, s22
@@ -152119,11 +152127,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s19, s20, s19
; SI-NEXT: s_add_i32 s20, s91, 3
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s22, s35, 8
-; SI-NEXT: s_add_i32 s23, s38, 3
+; SI-NEXT: s_lshl_b32 s22, s36, 8
+; SI-NEXT: s_add_i32 s23, s53, 3
; SI-NEXT: s_or_b32 s20, s22, s20
; SI-NEXT: s_and_b32 s23, s23, 0xff
-; SI-NEXT: s_lshl_b32 s22, s52, 24
+; SI-NEXT: s_lshl_b32 s22, s49, 24
; SI-NEXT: s_lshl_b32 s23, s23, 16
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_or_b32 s22, s22, s23
@@ -152131,93 +152139,93 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s20, s22, s20
; SI-NEXT: s_add_i32 s22, s92, 3
; SI-NEXT: s_and_b32 s22, s22, 0xff
-; SI-NEXT: s_lshl_b32 s23, s36, 8
-; SI-NEXT: s_add_i32 s60, s62, 3
+; SI-NEXT: s_lshl_b32 s23, s37, 8
+; SI-NEXT: s_add_i32 s60, s30, 3
; SI-NEXT: s_or_b32 s22, s23, s22
; SI-NEXT: s_and_b32 s60, s60, 0xff
-; SI-NEXT: s_lshl_b32 s23, s30, 24
+; SI-NEXT: s_lshl_b32 s23, s39, 24
; SI-NEXT: s_lshl_b32 s60, s60, 16
; SI-NEXT: s_addk_i32 s22, 0x300
; SI-NEXT: s_or_b32 s23, s23, s60
; SI-NEXT: s_and_b32 s22, s22, 0xffff
-; SI-NEXT: v_readlane_b32 s7, v44, 28
+; SI-NEXT: v_readlane_b32 s7, v44, 42
; SI-NEXT: s_or_b32 s22, s23, s22
; SI-NEXT: s_add_i32 s23, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v44, 27
+; SI-NEXT: v_readlane_b32 s7, v44, 41
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_lshl_b32 s60, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v44, 25
+; SI-NEXT: v_readlane_b32 s7, v44, 39
; SI-NEXT: s_or_b32 s23, s60, s23
; SI-NEXT: s_lshl_b32 s60, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 26
+; SI-NEXT: v_readlane_b32 s7, v44, 40
; SI-NEXT: s_add_i32 s61, s7, 3
; SI-NEXT: s_and_b32 s61, s61, 0xff
; SI-NEXT: s_lshl_b32 s61, s61, 16
; SI-NEXT: s_addk_i32 s23, 0x300
; SI-NEXT: s_or_b32 s60, s60, s61
; SI-NEXT: s_and_b32 s23, s23, 0xffff
-; SI-NEXT: v_readlane_b32 s7, v44, 32
+; SI-NEXT: v_readlane_b32 s7, v44, 58
; SI-NEXT: s_or_b32 s23, s60, s23
; SI-NEXT: s_add_i32 s60, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v44, 31
+; SI-NEXT: v_readlane_b32 s7, v44, 57
; SI-NEXT: s_and_b32 s60, s60, 0xff
; SI-NEXT: s_lshl_b32 s61, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v44, 29
+; SI-NEXT: v_readlane_b32 s7, v44, 43
; SI-NEXT: s_or_b32 s60, s61, s60
; SI-NEXT: s_lshl_b32 s61, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 30
+; SI-NEXT: v_readlane_b32 s7, v44, 56
; SI-NEXT: s_add_i32 s62, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 48
+; SI-NEXT: v_readlane_b32 s7, v42, 62
; SI-NEXT: s_and_b32 s62, s62, 0xff
; SI-NEXT: s_add_i32 s59, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 47
+; SI-NEXT: v_readlane_b32 s7, v42, 61
; SI-NEXT: s_lshl_b32 s62, s62, 16
; SI-NEXT: s_addk_i32 s60, 0x300
; SI-NEXT: s_and_b32 s59, s59, 0xff
; SI-NEXT: s_lshl_b32 s58, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v44, 33
+; SI-NEXT: v_readlane_b32 s7, v44, 55
; SI-NEXT: s_or_b32 s61, s61, s62
; SI-NEXT: s_and_b32 s60, s60, 0xffff
; SI-NEXT: s_or_b32 s58, s58, s59
; SI-NEXT: s_lshl_b32 s59, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v44, 44
+; SI-NEXT: v_readlane_b32 s7, v44, 54
; SI-NEXT: s_or_b32 s60, s61, s60
; SI-NEXT: s_add_i32 s61, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 46
+; SI-NEXT: v_readlane_b32 s7, v42, 60
; SI-NEXT: s_add_i32 s57, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 45
+; SI-NEXT: v_readlane_b32 s7, v42, 59
; SI-NEXT: s_lshl_b32 s56, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 44
+; SI-NEXT: v_readlane_b32 s7, v42, 58
; SI-NEXT: s_lshl_b32 s47, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 43
+; SI-NEXT: v_readlane_b32 s7, v42, 57
; SI-NEXT: s_add_i32 s46, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 42
+; SI-NEXT: v_readlane_b32 s7, v42, 56
; SI-NEXT: s_add_i32 s45, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 39
+; SI-NEXT: v_readlane_b32 s7, v42, 53
; SI-NEXT: s_lshl_b32 s42, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 36
+; SI-NEXT: v_readlane_b32 s7, v42, 50
; SI-NEXT: s_lshl_b32 s15, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 35
+; SI-NEXT: v_readlane_b32 s7, v42, 49
; SI-NEXT: s_and_b32 s45, s45, 0xff
; SI-NEXT: s_add_i32 s14, s7, 3
; SI-NEXT: s_or_b32 s42, s42, s45
; SI-NEXT: s_and_b32 s14, s14, 0xff
; SI-NEXT: s_lshl_b32 s14, s14, 16
; SI-NEXT: s_addk_i32 s42, 0x300
-; SI-NEXT: v_readlane_b32 s7, v43, 41
+; SI-NEXT: v_readlane_b32 s7, v42, 55
; SI-NEXT: s_and_b32 s57, s57, 0xff
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_and_b32 s15, s42, 0xffff
; SI-NEXT: s_add_i32 s44, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 40
+; SI-NEXT: v_readlane_b32 s7, v42, 54
; SI-NEXT: s_or_b32 s56, s56, s57
; SI-NEXT: s_or_b32 s57, s14, s15
; SI-NEXT: s_and_b32 s14, s44, 0xff
; SI-NEXT: s_lshl_b32 s15, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 38
+; SI-NEXT: v_readlane_b32 s7, v42, 52
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_lshl_b32 s15, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 37
+; SI-NEXT: v_readlane_b32 s7, v42, 51
; SI-NEXT: s_add_i32 s40, s7, 3
; SI-NEXT: s_and_b32 s61, s61, 0xff
; SI-NEXT: s_and_b32 s40, s40, 0xff
@@ -152232,15 +152240,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s58, s59, s58
; SI-NEXT: s_or_b32 s59, s15, s14
; SI-NEXT: s_add_i32 s14, s6, 0x3000000
-; SI-NEXT: v_readlane_b32 s6, v43, 32
+; SI-NEXT: v_readlane_b32 s6, v42, 46
; SI-NEXT: s_add_i32 s11, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 29
+; SI-NEXT: v_readlane_b32 s7, v42, 43
; SI-NEXT: s_and_b32 s6, s11, 0xff
; SI-NEXT: s_lshl_b32 s8, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 26
+; SI-NEXT: v_readlane_b32 s7, v42, 40
; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: s_lshl_b32 s8, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 25
+; SI-NEXT: v_readlane_b32 s7, v42, 39
; SI-NEXT: s_add_i32 s24, s7, 3
; SI-NEXT: s_and_b32 s11, s24, 0xff
; SI-NEXT: s_addk_i32 s6, 0x300
@@ -152248,47 +152256,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s8, s8, s11
; SI-NEXT: s_or_b32 s8, s8, s6
-; SI-NEXT: v_readlane_b32 s6, v43, 33
+; SI-NEXT: v_readlane_b32 s6, v42, 47
; SI-NEXT: s_add_i32 s12, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 27
+; SI-NEXT: v_readlane_b32 s7, v42, 41
; SI-NEXT: s_and_b32 s6, s12, 0xff
; SI-NEXT: s_lshl_b32 s11, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 17
+; SI-NEXT: v_readlane_b32 s7, v42, 31
; SI-NEXT: s_or_b32 s6, s11, s6
; SI-NEXT: s_lshl_b32 s11, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 19
+; SI-NEXT: v_readlane_b32 s7, v42, 33
; SI-NEXT: s_add_i32 s12, s7, 3
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_addk_i32 s6, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: v_readlane_b32 s7, v43, 34
+; SI-NEXT: v_readlane_b32 s7, v42, 48
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
; SI-NEXT: s_add_i32 s13, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 31
+; SI-NEXT: v_readlane_b32 s7, v42, 45
; SI-NEXT: s_or_b32 s6, s11, s6
; SI-NEXT: s_and_b32 s11, s13, 0xff
; SI-NEXT: s_lshl_b32 s10, s7, 8
-; SI-NEXT: v_readlane_b32 s7, v43, 23
+; SI-NEXT: v_readlane_b32 s7, v42, 37
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: s_lshl_b32 s11, s7, 24
-; SI-NEXT: v_readlane_b32 s7, v43, 24
+; SI-NEXT: v_readlane_b32 s7, v42, 38
; SI-NEXT: s_add_i32 s25, s7, 3
; SI-NEXT: s_and_b32 s12, s25, 0xff
; SI-NEXT: s_addk_i32 s10, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 16
; SI-NEXT: s_and_b32 s10, s10, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
-; SI-NEXT: v_readlane_b32 s7, v43, 30
+; SI-NEXT: v_readlane_b32 s7, v42, 44
; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: s_add_i32 s9, s7, 3
-; SI-NEXT: v_readlane_b32 s7, v43, 28
-; SI-NEXT: v_readlane_b32 s11, v43, 21
+; SI-NEXT: v_readlane_b32 s7, v42, 42
+; SI-NEXT: v_readlane_b32 s11, v42, 35
; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_add_i32 s11, s11, 3
; SI-NEXT: s_or_b32 s7, s7, s9
-; SI-NEXT: v_readlane_b32 s9, v43, 22
+; SI-NEXT: v_readlane_b32 s9, v42, 36
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_addk_i32 s7, 0x300
; SI-NEXT: s_lshl_b32 s9, s9, 24
@@ -152296,15 +152304,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s7, s7, 0xffff
; SI-NEXT: s_or_b32 s9, s9, s11
; SI-NEXT: s_or_b32 s7, s9, s7
-; SI-NEXT: v_readlane_b32 s9, v43, 20
+; SI-NEXT: v_readlane_b32 s9, v42, 34
; SI-NEXT: s_add_i32 s21, s9, 3
-; SI-NEXT: v_readlane_b32 s11, v43, 18
-; SI-NEXT: v_readlane_b32 s12, v43, 15
+; SI-NEXT: v_readlane_b32 s11, v42, 32
+; SI-NEXT: v_readlane_b32 s12, v42, 29
; SI-NEXT: s_and_b32 s9, s21, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 8
; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: v_readlane_b32 s11, v43, 16
+; SI-NEXT: v_readlane_b32 s11, v42, 30
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_addk_i32 s9, 0x300
; SI-NEXT: s_lshl_b32 s11, s11, 24
@@ -152312,15 +152320,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s9, s9, 0xffff
; SI-NEXT: s_or_b32 s11, s11, s12
; SI-NEXT: s_or_b32 s9, s11, s9
-; SI-NEXT: v_readlane_b32 s11, v43, 14
+; SI-NEXT: v_readlane_b32 s11, v42, 28
; SI-NEXT: s_add_i32 s11, s11, 3
-; SI-NEXT: v_readlane_b32 s12, v43, 13
-; SI-NEXT: v_readlane_b32 s13, v43, 11
+; SI-NEXT: v_readlane_b32 s12, v42, 27
+; SI-NEXT: v_readlane_b32 s13, v42, 25
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s12, s12, 8
; SI-NEXT: s_add_i32 s13, s13, 3
; SI-NEXT: s_or_b32 s11, s12, s11
-; SI-NEXT: v_readlane_b32 s12, v43, 12
+; SI-NEXT: v_readlane_b32 s12, v42, 26
; SI-NEXT: s_and_b32 s13, s13, 0xff
; SI-NEXT: s_addk_i32 s11, 0x300
; SI-NEXT: s_lshl_b32 s12, s12, 24
@@ -152328,16 +152336,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_or_b32 s11, s12, s11
-; SI-NEXT: v_readlane_b32 s12, v43, 10
+; SI-NEXT: v_readlane_b32 s12, v42, 24
; SI-NEXT: s_add_i32 s15, s16, 0x3000000
; SI-NEXT: s_add_i32 s12, s12, 3
-; SI-NEXT: v_readlane_b32 s13, v43, 9
-; SI-NEXT: v_readlane_b32 s16, v43, 7
+; SI-NEXT: v_readlane_b32 s13, v42, 23
+; SI-NEXT: v_readlane_b32 s16, v42, 21
; SI-NEXT: s_and_b32 s12, s12, 0xff
; SI-NEXT: s_lshl_b32 s13, s13, 8
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: s_or_b32 s12, s13, s12
-; SI-NEXT: v_readlane_b32 s13, v43, 8
+; SI-NEXT: v_readlane_b32 s13, v42, 22
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_addk_i32 s12, 0x300
; SI-NEXT: s_lshl_b32 s13, s13, 24
@@ -152345,16 +152353,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s12, s12, 0xffff
; SI-NEXT: s_or_b32 s13, s13, s16
; SI-NEXT: s_or_b32 s12, s13, s12
-; SI-NEXT: v_readlane_b32 s13, v43, 6
+; SI-NEXT: v_readlane_b32 s13, v42, 20
; SI-NEXT: s_add_i32 s40, s17, 0x3000000
; SI-NEXT: s_add_i32 s13, s13, 3
-; SI-NEXT: v_readlane_b32 s16, v43, 5
-; SI-NEXT: v_readlane_b32 s17, v43, 3
+; SI-NEXT: v_readlane_b32 s16, v42, 19
+; SI-NEXT: v_readlane_b32 s17, v42, 17
; SI-NEXT: s_and_b32 s13, s13, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 8
; SI-NEXT: s_add_i32 s17, s17, 3
; SI-NEXT: s_or_b32 s13, s16, s13
-; SI-NEXT: v_readlane_b32 s16, v43, 4
+; SI-NEXT: v_readlane_b32 s16, v42, 18
; SI-NEXT: s_and_b32 s17, s17, 0xff
; SI-NEXT: s_addk_i32 s13, 0x300
; SI-NEXT: s_lshl_b32 s16, s16, 24
@@ -152362,16 +152370,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s13, s13, 0xffff
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_or_b32 s13, s16, s13
-; SI-NEXT: v_readlane_b32 s16, v43, 2
+; SI-NEXT: v_readlane_b32 s16, v42, 16
; SI-NEXT: s_add_i32 s41, s18, 0x3000000
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v43, 1
-; SI-NEXT: v_readlane_b32 s18, v44, 63
+; SI-NEXT: v_readlane_b32 s17, v42, 15
+; SI-NEXT: v_readlane_b32 s18, v42, 13
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 0
+; SI-NEXT: v_readlane_b32 s17, v42, 14
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -152380,16 +152388,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s17, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v44, 62
+; SI-NEXT: v_readlane_b32 s16, v42, 12
; SI-NEXT: s_add_i32 s42, s19, 0x3000000
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s18, v44, 61
-; SI-NEXT: v_readlane_b32 s19, v44, 59
+; SI-NEXT: v_readlane_b32 s18, v42, 11
+; SI-NEXT: v_readlane_b32 s19, v42, 9
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 8
; SI-NEXT: s_add_i32 s19, s19, 3
; SI-NEXT: s_or_b32 s16, s18, s16
-; SI-NEXT: v_readlane_b32 s18, v44, 60
+; SI-NEXT: v_readlane_b32 s18, v42, 10
; SI-NEXT: s_and_b32 s19, s19, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s18, s18, 24
@@ -152397,16 +152405,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s18, s18, s19
; SI-NEXT: s_or_b32 s16, s18, s16
-; SI-NEXT: v_readlane_b32 s18, v44, 58
+; SI-NEXT: v_readlane_b32 s18, v42, 8
; SI-NEXT: s_add_i32 s43, s20, 0x3000000
; SI-NEXT: s_add_i32 s18, s18, 3
-; SI-NEXT: v_readlane_b32 s19, v44, 57
-; SI-NEXT: v_readlane_b32 s20, v44, 55
+; SI-NEXT: v_readlane_b32 s19, v42, 7
+; SI-NEXT: v_readlane_b32 s20, v42, 5
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_lshl_b32 s19, s19, 8
; SI-NEXT: s_add_i32 s20, s20, 3
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: v_readlane_b32 s19, v44, 56
+; SI-NEXT: v_readlane_b32 s19, v42, 6
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_addk_i32 s18, 0x300
; SI-NEXT: s_lshl_b32 s19, s19, 24
@@ -152414,15 +152422,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s18, s18, 0xffff
; SI-NEXT: s_or_b32 s19, s19, s20
; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: v_readlane_b32 s19, v44, 54
+; SI-NEXT: v_readlane_b32 s19, v42, 4
; SI-NEXT: s_add_i32 s19, s19, 3
-; SI-NEXT: v_readlane_b32 s20, v44, 53
-; SI-NEXT: v_readlane_b32 s21, v44, 51
+; SI-NEXT: v_readlane_b32 s20, v42, 3
+; SI-NEXT: v_readlane_b32 s21, v42, 1
; SI-NEXT: s_and_b32 s19, s19, 0xff
; SI-NEXT: s_lshl_b32 s20, s20, 8
; SI-NEXT: s_add_i32 s21, s21, 3
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: v_readlane_b32 s20, v44, 52
+; SI-NEXT: v_readlane_b32 s20, v42, 2
; SI-NEXT: s_and_b32 s21, s21, 0xff
; SI-NEXT: s_addk_i32 s19, 0x300
; SI-NEXT: s_lshl_b32 s20, s20, 24
@@ -152430,16 +152438,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_and_b32 s19, s19, 0xffff
; SI-NEXT: s_or_b32 s20, s20, s21
; SI-NEXT: s_or_b32 s19, s20, s19
-; SI-NEXT: v_readlane_b32 s20, v44, 50
+; SI-NEXT: v_readlane_b32 s20, v42, 0
; SI-NEXT: s_add_i32 s44, s22, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s21, v44, 49
-; SI-NEXT: v_readlane_b32 s22, v44, 47
+; SI-NEXT: v_readlane_b32 s21, v44, 63
+; SI-NEXT: v_readlane_b32 s22, v44, 61
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s21, s21, 8
; SI-NEXT: s_add_i32 s22, s22, 3
; SI-NEXT: s_or_b32 s20, s21, s20
-; SI-NEXT: v_readlane_b32 s21, v44, 48
+; SI-NEXT: v_readlane_b32 s21, v44, 62
; SI-NEXT: s_and_b32 s22, s22, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s21, s21, 24
@@ -152448,16 +152456,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s21, s21, s22
; SI-NEXT: s_or_b32 s20, s21, s20
; SI-NEXT: s_add_i32 s21, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v44, 43
+; SI-NEXT: v_readlane_b32 s20, v44, 53
; SI-NEXT: s_add_i32 s45, s23, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s22, v44, 42
-; SI-NEXT: v_readlane_b32 s23, v44, 45
+; SI-NEXT: v_readlane_b32 s22, v44, 52
+; SI-NEXT: v_readlane_b32 s23, v44, 59
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s22, s22, 8
; SI-NEXT: s_add_i32 s23, s23, 3
; SI-NEXT: s_or_b32 s20, s22, s20
-; SI-NEXT: v_readlane_b32 s22, v44, 46
+; SI-NEXT: v_readlane_b32 s22, v44, 60
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s22, s22, 24
@@ -152466,15 +152474,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s22, s22, s23
; SI-NEXT: s_or_b32 s20, s22, s20
; SI-NEXT: s_add_i32 s22, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v44, 41
+; SI-NEXT: v_readlane_b32 s20, v44, 51
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s23, v44, 40
-; SI-NEXT: v_readlane_b32 s24, v44, 38
+; SI-NEXT: v_readlane_b32 s23, v44, 50
+; SI-NEXT: v_readlane_b32 s24, v44, 48
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s23, s23, 8
; SI-NEXT: s_add_i32 s24, s24, 3
; SI-NEXT: s_or_b32 s20, s23, s20
-; SI-NEXT: v_readlane_b32 s23, v44, 39
+; SI-NEXT: v_readlane_b32 s23, v44, 49
; SI-NEXT: s_and_b32 s24, s24, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s23, s23, 24
@@ -152483,15 +152491,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s23, s23, s24
; SI-NEXT: s_or_b32 s20, s23, s20
; SI-NEXT: s_add_i32 s23, s20, 0x3000000
-; SI-NEXT: v_readlane_b32 s20, v44, 37
+; SI-NEXT: v_readlane_b32 s20, v44, 47
; SI-NEXT: s_add_i32 s20, s20, 3
-; SI-NEXT: v_readlane_b32 s24, v44, 36
-; SI-NEXT: v_readlane_b32 s25, v44, 34
+; SI-NEXT: v_readlane_b32 s24, v44, 46
+; SI-NEXT: v_readlane_b32 s25, v44, 44
; SI-NEXT: s_and_b32 s20, s20, 0xff
; SI-NEXT: s_lshl_b32 s24, s24, 8
; SI-NEXT: s_add_i32 s25, s25, 3
; SI-NEXT: s_or_b32 s20, s24, s20
-; SI-NEXT: v_readlane_b32 s24, v44, 35
+; SI-NEXT: v_readlane_b32 s24, v44, 45
; SI-NEXT: s_and_b32 s25, s25, 0xff
; SI-NEXT: s_addk_i32 s20, 0x300
; SI-NEXT: s_lshl_b32 s24, s24, 24
@@ -152509,100 +152517,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_or_b32 s24, s25, s24
; SI-NEXT: v_readlane_b32 s25, v44, 0
; SI-NEXT: s_and_b32 s26, s26, 0xff
+; SI-NEXT: s_add_i32 s11, s11, 0x3000000
+; SI-NEXT: s_add_i32 s12, s12, 0x3000000
; SI-NEXT: s_add_i32 s13, s13, 0x3000000
+; SI-NEXT: s_add_i32 s19, s19, 0x3000000
; SI-NEXT: s_addk_i32 s24, 0x300
; SI-NEXT: s_lshl_b32 s25, s25, 24
; SI-NEXT: s_lshl_b32 s26, s26, 16
; SI-NEXT: s_add_i32 s9, s9, 0x3000000
-; SI-NEXT: s_add_i32 s11, s11, 0x3000000
; SI-NEXT: s_add_i32 s18, s18, 0x3000000
; SI-NEXT: s_and_b32 s24, s24, 0xffff
; SI-NEXT: s_or_b32 s25, s25, s26
-; SI-NEXT: s_and_b32 s89, s17, 0xffff0000
-; SI-NEXT: s_lshl_b32 s88, s17, 16
-; SI-NEXT: s_and_b32 s17, s13, 0xffff0000
-; SI-NEXT: s_add_i32 s7, s7, 0x3000000
+; SI-NEXT: s_and_b32 s76, s19, 0xffff0000
+; SI-NEXT: s_lshl_b32 s95, s19, 16
+; SI-NEXT: s_and_b32 s89, s13, 0xffff0000
+; SI-NEXT: s_lshl_b32 s19, s13, 16
+; SI-NEXT: s_and_b32 s13, s12, 0xffff0000
+; SI-NEXT: s_lshl_b32 s78, s12, 16
+; SI-NEXT: s_and_b32 s12, s11, 0xffff0000
; SI-NEXT: s_or_b32 s24, s25, s24
-; SI-NEXT: s_and_b32 s74, s18, 0xffff0000
-; SI-NEXT: s_lshl_b32 s25, s18, 16
-; SI-NEXT: v_writelane_b32 v43, s17, 49
-; SI-NEXT: s_and_b32 s63, s11, 0xffff0000
+; SI-NEXT: s_and_b32 s25, s22, 0xffff0000
+; SI-NEXT: s_lshl_b32 s35, s22, 16
+; SI-NEXT: s_and_b32 s88, s18, 0xffff0000
+; SI-NEXT: s_lshl_b32 s22, s18, 16
+; SI-NEXT: v_writelane_b32 v43, s12, 0
; SI-NEXT: s_lshl_b32 s18, s11, 16
; SI-NEXT: s_and_b32 s11, s9, 0xffff0000
-; SI-NEXT: s_and_b32 s46, s46, 0xff
-; SI-NEXT: s_add_i32 s6, s6, 0x3000000
-; SI-NEXT: v_writelane_b32 v43, s11, 50
-; SI-NEXT: s_lshl_b32 s61, s9, 16
+; SI-NEXT: s_add_i32 s7, s7, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s11, 1
+; SI-NEXT: s_lshl_b32 s9, s9, 16
+; SI-NEXT: s_add_i32 s10, s10, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s9, 2
; SI-NEXT: s_and_b32 s9, s7, 0xffff0000
+; SI-NEXT: s_and_b32 s46, s46, 0xff
+; SI-NEXT: s_and_b32 s79, s17, 0xffff0000
+; SI-NEXT: s_lshl_b32 s77, s17, 16
+; SI-NEXT: v_writelane_b32 v43, s9, 3
+; SI-NEXT: s_lshl_b32 s17, s7, 16
+; SI-NEXT: s_and_b32 s7, s10, 0xffff0000
; SI-NEXT: s_lshl_b32 s46, s46, 16
; SI-NEXT: s_addk_i32 s56, 0x300
-; SI-NEXT: s_add_i32 s8, s8, 0x3000000
-; SI-NEXT: v_writelane_b32 v43, s9, 51
-; SI-NEXT: s_lshl_b32 s17, s7, 16
-; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
+; SI-NEXT: s_add_i32 s6, s6, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s7, 4
+; SI-NEXT: s_lshl_b32 s7, s10, 16
; SI-NEXT: s_or_b32 s46, s47, s46
; SI-NEXT: s_and_b32 s47, s56, 0xffff
-; SI-NEXT: v_writelane_b32 v43, s7, 52
-; SI-NEXT: s_and_b32 s7, s8, 0xffff0000
+; SI-NEXT: s_add_i32 s8, s8, 0x3000000
+; SI-NEXT: v_writelane_b32 v43, s7, 5
+; SI-NEXT: s_and_b32 s7, s6, 0xffff0000
; SI-NEXT: s_or_b32 s56, s46, s47
; SI-NEXT: s_add_i32 s47, s58, 0x3000000
; SI-NEXT: s_add_i32 s58, s59, 0x3000000
-; SI-NEXT: v_writelane_b32 v43, s7, 53
+; SI-NEXT: v_writelane_b32 v43, s7, 6
; SI-NEXT: s_lshl_b32 s7, s8, 16
-; SI-NEXT: s_add_i32 s57, s57, 0x3000000
-; SI-NEXT: v_writelane_b32 v43, s7, 54
-; SI-NEXT: s_and_b32 s7, s58, 0xffff0000
; SI-NEXT: s_add_i32 s4, s4, 0x3000000
; SI-NEXT: s_add_i32 s5, s5, 0x3000000
; SI-NEXT: s_add_i32 s46, s60, 0x3000000
; SI-NEXT: s_add_i32 s56, s56, 0x3000000
-; SI-NEXT: s_add_i32 s10, s10, 0x3000000
-; SI-NEXT: s_add_i32 s12, s12, 0x3000000
+; SI-NEXT: s_add_i32 s57, s57, 0x3000000
; SI-NEXT: s_add_i32 s16, s16, 0x3000000
-; SI-NEXT: s_add_i32 s19, s19, 0x3000000
; SI-NEXT: s_add_i32 s20, s20, 0x3000000
; SI-NEXT: s_add_i32 s24, s24, 0x3000000
-; SI-NEXT: v_writelane_b32 v43, s7, 55
-; SI-NEXT: s_and_b32 s7, s57, 0xffff0000
+; SI-NEXT: v_writelane_b32 v43, s7, 7
+; SI-NEXT: s_and_b32 s7, s58, 0xffff0000
; SI-NEXT: s_and_b32 s27, s24, 0xffff0000
; SI-NEXT: s_lshl_b32 s26, s24, 16
-; SI-NEXT: s_and_b32 s65, s20, 0xffff0000
+; SI-NEXT: s_and_b32 s24, s20, 0xffff0000
; SI-NEXT: s_lshl_b32 s20, s20, 16
-; SI-NEXT: s_and_b32 s66, s23, 0xffff0000
+; SI-NEXT: s_and_b32 s73, s23, 0xffff0000
; SI-NEXT: s_lshl_b32 s29, s23, 16
-; SI-NEXT: s_and_b32 s64, s22, 0xffff0000
-; SI-NEXT: s_lshl_b32 s95, s22, 16
-; SI-NEXT: s_and_b32 s76, s21, 0xffff0000
+; SI-NEXT: s_and_b32 s23, s21, 0xffff0000
; SI-NEXT: s_lshl_b32 s21, s21, 16
-; SI-NEXT: s_and_b32 s77, s19, 0xffff0000
-; SI-NEXT: s_lshl_b32 s22, s19, 16
-; SI-NEXT: s_and_b32 s78, s16, 0xffff0000
+; SI-NEXT: s_and_b32 s74, s16, 0xffff0000
; SI-NEXT: s_lshl_b32 s16, s16, 16
-; SI-NEXT: s_lshl_b32 s19, s13, 16
-; SI-NEXT: s_and_b32 s75, s12, 0xffff0000
-; SI-NEXT: s_lshl_b32 s79, s12, 16
-; SI-NEXT: s_and_b32 s13, s10, 0xffff0000
-; SI-NEXT: s_lshl_b32 s59, s10, 16
; SI-NEXT: s_lshl_b32 s6, s6, 16
+; SI-NEXT: s_and_b32 s60, s8, 0xffff0000
+; SI-NEXT: v_writelane_b32 v43, s7, 8
; SI-NEXT: s_lshl_b32 s99, s58, 16
-; SI-NEXT: v_writelane_b32 v43, s7, 56
+; SI-NEXT: s_and_b32 s58, s57, 0xffff0000
; SI-NEXT: s_lshl_b32 s57, s57, 16
; SI-NEXT: s_and_b32 s7, s56, 0xffff0000
; SI-NEXT: s_lshl_b32 s8, s56, 16
; SI-NEXT: s_and_b32 s56, s47, 0xffff0000
-; SI-NEXT: s_lshl_b32 s23, s47, 16
-; SI-NEXT: s_and_b32 s47, s46, 0xffff0000
+; SI-NEXT: s_lshl_b32 s47, s47, 16
+; SI-NEXT: s_and_b32 s61, s46, 0xffff0000
; SI-NEXT: s_lshl_b32 s97, s46, 16
-; SI-NEXT: s_and_b32 s24, s45, 0xffff0000
-; SI-NEXT: s_lshl_b32 s45, s45, 16
-; SI-NEXT: s_and_b32 s58, s44, 0xffff0000
+; SI-NEXT: s_and_b32 s62, s45, 0xffff0000
+; SI-NEXT: s_lshl_b32 s46, s45, 16
+; SI-NEXT: s_and_b32 s64, s44, 0xffff0000
; SI-NEXT: s_lshl_b32 s28, s44, 16
-; SI-NEXT: s_and_b32 s73, s43, 0xffff0000
-; SI-NEXT: s_lshl_b32 s46, s43, 16
+; SI-NEXT: s_and_b32 s65, s43, 0xffff0000
+; SI-NEXT: s_lshl_b32 s43, s43, 16
; SI-NEXT: s_and_b32 s67, s42, 0xffff0000
; SI-NEXT: s_lshl_b32 s87, s42, 16
; SI-NEXT: s_and_b32 s68, s41, 0xffff0000
-; SI-NEXT: s_lshl_b32 s42, s41, 16
+; SI-NEXT: s_lshl_b32 s66, s41, 16
; SI-NEXT: s_and_b32 s70, s40, 0xffff0000
; SI-NEXT: s_lshl_b32 s86, s40, 16
; SI-NEXT: s_and_b32 s94, s15, 0xffff0000
@@ -152613,99 +152622,102 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_lshl_b32 s80, s5, 16
; SI-NEXT: s_and_b32 s15, s4, 0xffff0000
; SI-NEXT: s_lshl_b32 s84, s4, 16
-; SI-NEXT: v_writelane_b32 v43, s7, 57
+; SI-NEXT: v_writelane_b32 v42, s13, 63
+; SI-NEXT: v_writelane_b32 v43, s7, 9
; SI-NEXT: .LBB89_3: ; %end
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
-; SI-NEXT: v_readlane_b32 s4, v43, 49
+; SI-NEXT: v_readlane_b32 s4, v42, 63
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 50
+; SI-NEXT: v_readlane_b32 s4, v43, 1
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_readlane_b32 s4, v43, 2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 51
+; SI-NEXT: v_readlane_b32 s4, v43, 3
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
@@ -152713,14 +152725,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT: v_readlane_b32 s4, v43, 4
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_readlane_b32 s4, v43, 5
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 52
+; SI-NEXT: v_readlane_b32 s4, v43, 6
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
@@ -152728,16 +152742,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 53
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; SI-NEXT: v_readlane_b32 s4, v43, 54
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_readlane_b32 s4, v43, 7
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 55
+; SI-NEXT: v_readlane_b32 s4, v43, 8
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
@@ -152745,15 +152758,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 56
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
-; SI-NEXT: v_readlane_b32 s4, v43, 57
+; SI-NEXT: v_readlane_b32 s4, v43, 9
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
@@ -152765,35 +152777,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -152807,7 +152819,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -152892,52 +152904,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB89_4:
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: s_mov_b32 s7, s6
+; SI-NEXT: v_readlane_b32 s92, v44, 38
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: v_readlane_b32 s91, v44, 35
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; kill: killed $sgpr8
; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: v_readlane_b32 s90, v44, 31
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: v_readlane_b32 s37, v44, 37
; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; kill: killed $sgpr8
-; SI-NEXT: v_readlane_b32 s92, v44, 24
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: v_readlane_b32 s91, v44, 20
-; SI-NEXT: s_mov_b32 s90, s88
-; SI-NEXT: v_readlane_b32 s36, v44, 23
-; SI-NEXT: v_readlane_b32 s35, v44, 19
-; SI-NEXT: v_readlane_b32 s62, v44, 22
-; SI-NEXT: v_readlane_b32 s38, v44, 18
-; SI-NEXT: s_mov_b32 s34, s46
-; SI-NEXT: s_mov_b32 s93, s21
-; SI-NEXT: s_mov_b32 s37, s43
-; SI-NEXT: s_mov_b32 s39, s75
-; SI-NEXT: v_readlane_b32 s72, v44, 10
-; SI-NEXT: s_mov_b32 s50, s63
-; SI-NEXT: s_mov_b32 s51, s59
-; SI-NEXT: s_mov_b32 s48, s56
-; SI-NEXT: v_readlane_b32 s30, v44, 21
-; SI-NEXT: s_mov_b32 s49, s61
-; SI-NEXT: s_mov_b32 s52, s79
-; SI-NEXT: v_readlane_b32 s98, v44, 6
-; SI-NEXT: s_mov_b32 s55, s45
-; SI-NEXT: v_readlane_b32 s43, v44, 17
-; SI-NEXT: s_mov_b32 s60, s40
-; SI-NEXT: v_readlane_b32 s41, v44, 14
-; SI-NEXT: s_mov_b32 s53, s42
-; SI-NEXT: s_mov_b32 s54, s13
-; SI-NEXT: v_readlane_b32 s14, v44, 13
-; SI-NEXT: v_readlane_b32 s44, v44, 5
-; SI-NEXT: v_readlane_b32 s9, v44, 11
-; SI-NEXT: v_readlane_b32 s81, v44, 12
-; SI-NEXT: v_readlane_b32 s82, v44, 9
-; SI-NEXT: v_readlane_b32 s10, v44, 16
-; SI-NEXT: v_readlane_b32 s12, v44, 4
-; SI-NEXT: v_readlane_b32 s96, v44, 7
-; SI-NEXT: v_readlane_b32 s83, v44, 8
-; SI-NEXT: v_readlane_b32 s71, v44, 15
+; SI-NEXT: ; kill: killed $sgpr6
+; SI-NEXT: v_readlane_b32 s36, v44, 34
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_readlane_b32 s30, v44, 36
+; SI-NEXT: v_readlane_b32 s53, v44, 33
+; SI-NEXT: v_readlane_b32 s34, v44, 27
+; SI-NEXT: v_readlane_b32 s93, v44, 19
+; SI-NEXT: v_readlane_b32 s38, v44, 26
+; SI-NEXT: v_readlane_b32 s75, v44, 18
+; SI-NEXT: v_readlane_b32 s72, v44, 11
+; SI-NEXT: v_readlane_b32 s63, v44, 23
+; SI-NEXT: v_readlane_b32 s59, v44, 24
+; SI-NEXT: v_readlane_b32 s48, v44, 25
+; SI-NEXT: s_mov_b32 s39, s43
+; SI-NEXT: v_readlane_b32 s52, v44, 22
+; SI-NEXT: v_readlane_b32 s49, v44, 32
+; SI-NEXT: v_readlane_b32 s98, v44, 7
+; SI-NEXT: v_readlane_b32 s54, v44, 30
+; SI-NEXT: v_readlane_b32 s55, v44, 20
+; SI-NEXT: v_readlane_b32 s51, v44, 21
+; SI-NEXT: v_readlane_b32 s41, v44, 15
+; SI-NEXT: v_readlane_b32 s50, v44, 28
+; SI-NEXT: v_readlane_b32 s13, v44, 29
+; SI-NEXT: v_readlane_b32 s14, v44, 14
+; SI-NEXT: v_readlane_b32 s44, v44, 6
+; SI-NEXT: v_readlane_b32 s9, v44, 12
+; SI-NEXT: v_readlane_b32 s81, v44, 13
+; SI-NEXT: v_readlane_b32 s82, v44, 10
+; SI-NEXT: v_readlane_b32 s10, v44, 17
+; SI-NEXT: v_readlane_b32 s7, v44, 4
+; SI-NEXT: v_readlane_b32 s12, v44, 5
+; SI-NEXT: v_readlane_b32 s96, v44, 8
+; SI-NEXT: v_readlane_b32 s83, v44, 9
+; SI-NEXT: v_readlane_b32 s71, v44, 16
; SI-NEXT: ; kill: killed $sgpr6
; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: ; kill: killed $sgpr8
@@ -152946,50 +152962,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; implicit-def: $sgpr27
; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr65
+; SI-NEXT: ; implicit-def: $sgpr24
; SI-NEXT: ; implicit-def: $sgpr29
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr95
-; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr35
+; SI-NEXT: ; implicit-def: $sgpr25
; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr95
; SI-NEXT: ; implicit-def: $sgpr76
; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr77
-; SI-NEXT: ; implicit-def: $sgpr25
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr77
; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr75
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr89
+; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $sgpr63
-; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: ; implicit-def: $sgpr17
; SI-NEXT: ; kill: killed $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr13
; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr99
-; SI-NEXT: ; implicit-def: $sgpr57
; SI-NEXT: ; kill: killed $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr57
+; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: ; kill: killed $sgpr11
-; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr47
; SI-NEXT: ; implicit-def: $sgpr56
; SI-NEXT: ; implicit-def: $sgpr97
-; SI-NEXT: ; implicit-def: $sgpr47
-; SI-NEXT: ; implicit-def: $sgpr45
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr43
+; SI-NEXT: ; implicit-def: $sgpr65
; SI-NEXT: ; implicit-def: $sgpr87
; SI-NEXT: ; implicit-def: $sgpr67
-; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr68
; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr70
@@ -177734,12 +177748,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
-; SI-NEXT: s_mov_b32 s10, s16
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_writelane_b32 v61, s29, 0
-; SI-NEXT: v_writelane_b32 v61, s28, 1
-; SI-NEXT: v_writelane_b32 v61, s27, 2
-; SI-NEXT: s_mov_b32 s61, s21
+; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_writelane_b32 v63, s30, 0
; SI-NEXT: v_writelane_b32 v63, s31, 1
; SI-NEXT: v_writelane_b32 v63, s34, 2
@@ -177773,59 +177782,58 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_writelane_b32 v63, s86, 30
; SI-NEXT: v_writelane_b32 v63, s87, 31
; SI-NEXT: v_writelane_b32 v63, s96, 32
+; SI-NEXT: s_mov_b32 s10, s16
; SI-NEXT: v_writelane_b32 v63, s97, 33
-; SI-NEXT: s_mov_b32 s67, s19
-; SI-NEXT: s_mov_b32 s54, s17
-; SI-NEXT: s_mov_b32 s35, s23
-; SI-NEXT: s_mov_b32 s39, s26
-; SI-NEXT: s_mov_b32 s62, s25
+; SI-NEXT: s_mov_b32 s50, s29
; SI-NEXT: v_writelane_b32 v63, s98, 34
; SI-NEXT: v_writelane_b32 v63, s99, 35
; SI-NEXT: v_readfirstlane_b32 s99, v1
-; SI-NEXT: v_readfirstlane_b32 s74, v24
+; SI-NEXT: v_readfirstlane_b32 s44, v22
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s6, v23
+; SI-NEXT: v_readfirstlane_b32 s73, v21
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v62, s74, 0
-; SI-NEXT: v_readfirstlane_b32 s12, v26
-; SI-NEXT: v_writelane_b32 v62, s6, 1
+; SI-NEXT: v_writelane_b32 v62, s44, 0
+; SI-NEXT: v_readfirstlane_b32 s74, v24
+; SI-NEXT: v_writelane_b32 v62, s73, 1
+; SI-NEXT: s_mov_b32 s60, s20
+; SI-NEXT: v_readfirstlane_b32 s69, v23
+; SI-NEXT: v_writelane_b32 v62, s74, 2
+; SI-NEXT: v_readfirstlane_b32 s11, v26
+; SI-NEXT: v_writelane_b32 v62, s69, 3
; SI-NEXT: v_readfirstlane_b32 s14, v25
-; SI-NEXT: v_writelane_b32 v62, s12, 2
+; SI-NEXT: v_writelane_b32 v62, s11, 4
; SI-NEXT: v_readfirstlane_b32 s46, v28
-; SI-NEXT: v_writelane_b32 v62, s14, 3
+; SI-NEXT: v_writelane_b32 v62, s14, 5
+; SI-NEXT: s_mov_b32 s68, s24
; SI-NEXT: v_readfirstlane_b32 s56, v27
-; SI-NEXT: v_writelane_b32 v62, s46, 4
+; SI-NEXT: v_writelane_b32 v62, s46, 6
+; SI-NEXT: s_mov_b32 s76, s27
; SI-NEXT: v_readfirstlane_b32 s57, v30
-; SI-NEXT: v_writelane_b32 v62, s56, 5
-; SI-NEXT: v_readfirstlane_b32 s59, v29
-; SI-NEXT: v_writelane_b32 v62, s57, 6
-; SI-NEXT: v_writelane_b32 v62, s59, 7
-; SI-NEXT: s_mov_b32 s60, s20
-; SI-NEXT: s_mov_b32 s63, s24
+; SI-NEXT: v_writelane_b32 v62, s56, 7
+; SI-NEXT: v_writelane_b32 v62, s57, 8
; SI-NEXT: v_readfirstlane_b32 s95, v3
; SI-NEXT: v_readfirstlane_b32 s31, v5
; SI-NEXT: v_readfirstlane_b32 s24, v9
; SI-NEXT: v_readfirstlane_b32 s38, v12
; SI-NEXT: v_readfirstlane_b32 s36, v11
-; SI-NEXT: v_readfirstlane_b32 s8, v14
-; SI-NEXT: v_readfirstlane_b32 s27, v13
; SI-NEXT: v_readfirstlane_b32 s9, v16
; SI-NEXT: v_readfirstlane_b32 s79, v15
; SI-NEXT: v_readfirstlane_b32 s13, v18
-; SI-NEXT: v_readfirstlane_b32 s15, v17
+; SI-NEXT: v_readfirstlane_b32 s40, v17
; SI-NEXT: v_readfirstlane_b32 s42, v20
; SI-NEXT: v_readfirstlane_b32 s43, v19
-; SI-NEXT: v_readfirstlane_b32 s44, v22
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328
-; SI-NEXT: v_writelane_b32 v61, s4, 3
-; SI-NEXT: v_readfirstlane_b32 s45, v21
+; SI-NEXT: v_readfirstlane_b32 s89, v29
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s27, v13
; SI-NEXT: v_readfirstlane_b32 s98, v10
; SI-NEXT: v_readfirstlane_b32 s90, v8
; SI-NEXT: v_readfirstlane_b32 s88, v7
; SI-NEXT: v_readfirstlane_b32 s91, v6
-; SI-NEXT: v_readfirstlane_b32 s93, v4
+; SI-NEXT: v_readfirstlane_b32 s6, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328
+; SI-NEXT: v_writelane_b32 v61, s4, 0
; SI-NEXT: v_readfirstlane_b32 s55, v2
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
@@ -177844,389 +177852,374 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324
-; SI-NEXT: v_writelane_b32 v61, s4, 4
+; SI-NEXT: v_writelane_b32 v61, s4, 1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320
-; SI-NEXT: v_writelane_b32 v61, s4, 5
+; SI-NEXT: v_writelane_b32 v61, s4, 2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT: v_writelane_b32 v61, s4, 6
+; SI-NEXT: v_writelane_b32 v61, s4, 3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312
-; SI-NEXT: v_writelane_b32 v61, s4, 7
+; SI-NEXT: v_writelane_b32 v61, s4, 4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308
-; SI-NEXT: v_writelane_b32 v61, s4, 8
+; SI-NEXT: v_writelane_b32 v61, s4, 5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304
-; SI-NEXT: v_writelane_b32 v61, s4, 9
+; SI-NEXT: v_writelane_b32 v61, s4, 6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300
-; SI-NEXT: v_writelane_b32 v61, s4, 10
+; SI-NEXT: v_writelane_b32 v61, s4, 7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296
-; SI-NEXT: v_writelane_b32 v61, s4, 11
+; SI-NEXT: v_writelane_b32 v61, s4, 8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292
-; SI-NEXT: v_writelane_b32 v61, s4, 12
+; SI-NEXT: v_writelane_b32 v61, s4, 9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288
-; SI-NEXT: v_writelane_b32 v61, s4, 13
+; SI-NEXT: v_writelane_b32 v61, s4, 10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284
-; SI-NEXT: v_writelane_b32 v61, s4, 14
+; SI-NEXT: v_writelane_b32 v61, s4, 11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280
-; SI-NEXT: v_writelane_b32 v61, s4, 15
+; SI-NEXT: v_writelane_b32 v61, s4, 12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s67, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276
-; SI-NEXT: v_writelane_b32 v61, s4, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s54, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272
-; SI-NEXT: v_writelane_b32 v61, s4, 17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s65, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268
-; SI-NEXT: v_writelane_b32 v61, s4, 18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s70, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264
-; SI-NEXT: v_writelane_b32 v61, s4, 19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s71, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260
-; SI-NEXT: v_writelane_b32 v61, s4, 20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s49, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
-; SI-NEXT: v_writelane_b32 v61, s4, 21
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s83, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
-; SI-NEXT: v_writelane_b32 v61, s4, 22
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s80, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248
-; SI-NEXT: v_writelane_b32 v61, s4, 23
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s82, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244
-; SI-NEXT: v_writelane_b32 v61, s4, 24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s84, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240
-; SI-NEXT: v_writelane_b32 v61, s4, 25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s87, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236
-; SI-NEXT: v_writelane_b32 v61, s4, 26
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s86, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232
-; SI-NEXT: v_writelane_b32 v61, s4, 27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s51, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
-; SI-NEXT: v_writelane_b32 v61, s4, 28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s96, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224
-; SI-NEXT: v_writelane_b32 v61, s4, 29
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v61, s4, 30
+; SI-NEXT: v_writelane_b32 v61, s4, 13
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s94, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216
-; SI-NEXT: v_writelane_b32 v61, s4, 31
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212
-; SI-NEXT: v_writelane_b32 v61, s4, 32
+; SI-NEXT: v_writelane_b32 v61, s4, 14
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s16, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v61, s4, 33
+; SI-NEXT: v_writelane_b32 v61, s4, 15
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s89, v31
+; SI-NEXT: v_readfirstlane_b32 s15, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196
-; SI-NEXT: v_writelane_b32 v61, s4, 34
+; SI-NEXT: v_writelane_b32 v61, s4, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s73, v31
+; SI-NEXT: v_readfirstlane_b32 s45, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188
-; SI-NEXT: v_writelane_b32 v61, s4, 35
+; SI-NEXT: v_writelane_b32 v61, s4, 17
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s72, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184
+; SI-NEXT: v_writelane_b32 v61, s4, 18
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s40, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
+; SI-NEXT: v_writelane_b32 v61, s4, 19
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s21, v31
+; SI-NEXT: v_readfirstlane_b32 s85, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s85, v31
+; SI-NEXT: v_readfirstlane_b32 s81, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s81, v31
+; SI-NEXT: v_readfirstlane_b32 s97, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s97, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164
+; SI-NEXT: v_writelane_b32 v61, s4, 20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s7, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: v_writelane_b32 v61, s4, 21
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s11, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156
+; SI-NEXT: v_writelane_b32 v61, s4, 22
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s41, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s47, v31
+; SI-NEXT: v_readfirstlane_b32 s12, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_readfirstlane_b32 s58, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s76, v31
+; SI-NEXT: v_readfirstlane_b32 s47, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s29, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
+; SI-NEXT: v_writelane_b32 v61, s4, 23
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
+; SI-NEXT: v_readfirstlane_b32 s59, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
-; SI-NEXT: v_writelane_b32 v61, s4, 36
-; SI-NEXT: v_writelane_b32 v61, s54, 37
-; SI-NEXT: v_writelane_b32 v61, s10, 38
-; SI-NEXT: v_writelane_b32 v61, s67, 39
-; SI-NEXT: v_writelane_b32 v61, s18, 40
-; SI-NEXT: v_writelane_b32 v61, s61, 41
-; SI-NEXT: v_writelane_b32 v61, s60, 42
-; SI-NEXT: v_writelane_b32 v61, s35, 43
-; SI-NEXT: v_writelane_b32 v61, s22, 44
-; SI-NEXT: v_writelane_b32 v61, s62, 45
-; SI-NEXT: v_writelane_b32 v61, s63, 46
-; SI-NEXT: v_writelane_b32 v61, s39, 47
-; SI-NEXT: v_writelane_b32 v61, s99, 48
-; SI-NEXT: v_writelane_b32 v61, s95, 49
-; SI-NEXT: v_writelane_b32 v61, s31, 50
-; SI-NEXT: v_writelane_b32 v61, s24, 51
-; SI-NEXT: v_writelane_b32 v61, s38, 52
-; SI-NEXT: v_writelane_b32 v61, s36, 53
-; SI-NEXT: v_writelane_b32 v61, s8, 54
-; SI-NEXT: v_writelane_b32 v61, s27, 55
-; SI-NEXT: v_writelane_b32 v61, s9, 56
-; SI-NEXT: v_writelane_b32 v61, s79, 57
-; SI-NEXT: v_writelane_b32 v61, s13, 58
-; SI-NEXT: v_writelane_b32 v61, s15, 59
-; SI-NEXT: v_writelane_b32 v61, s42, 60
-; SI-NEXT: v_writelane_b32 v61, s43, 61
-; SI-NEXT: v_writelane_b32 v61, s44, 62
-; SI-NEXT: v_writelane_b32 v61, s45, 63
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s37, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; SI-NEXT: v_writelane_b32 v61, s4, 24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s50, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; SI-NEXT: v_writelane_b32 v61, s4, 25
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s48, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; SI-NEXT: v_writelane_b32 v61, s4, 26
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s19, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; SI-NEXT: v_writelane_b32 v61, s4, 27
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s64, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; SI-NEXT: v_writelane_b32 v61, s4, 28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s17, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; SI-NEXT: v_writelane_b32 v61, s4, 29
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s65, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; SI-NEXT: v_writelane_b32 v61, s4, 30
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s71, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; SI-NEXT: v_writelane_b32 v61, s4, 31
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s70, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: v_writelane_b32 v61, s4, 32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s83, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT: v_writelane_b32 v61, s4, 33
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s49, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT: v_writelane_b32 v61, s4, 34
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s80, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; SI-NEXT: v_writelane_b32 v61, s4, 35
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s82, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; SI-NEXT: v_writelane_b32 v61, s4, 36
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s87, v31
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: v_writelane_b32 v61, s4, 37
+; SI-NEXT: v_writelane_b32 v61, s17, 38
+; SI-NEXT: v_writelane_b32 v61, s10, 39
+; SI-NEXT: v_writelane_b32 v61, s19, 40
+; SI-NEXT: v_writelane_b32 v61, s18, 41
+; SI-NEXT: v_writelane_b32 v61, s21, 42
+; SI-NEXT: v_writelane_b32 v61, s60, 43
+; SI-NEXT: v_writelane_b32 v61, s23, 44
+; SI-NEXT: v_writelane_b32 v61, s22, 45
+; SI-NEXT: v_writelane_b32 v61, s25, 46
+; SI-NEXT: v_writelane_b32 v61, s68, 47
+; SI-NEXT: v_writelane_b32 v61, s76, 48
+; SI-NEXT: v_writelane_b32 v61, s26, 49
+; SI-NEXT: v_writelane_b32 v61, s50, 50
+; SI-NEXT: v_writelane_b32 v61, s99, 51
+; SI-NEXT: v_writelane_b32 v61, s28, 52
+; SI-NEXT: v_writelane_b32 v61, s95, 53
+; SI-NEXT: v_writelane_b32 v61, s31, 54
+; SI-NEXT: v_writelane_b32 v61, s24, 55
+; SI-NEXT: v_writelane_b32 v61, s38, 56
+; SI-NEXT: v_writelane_b32 v61, s36, 57
+; SI-NEXT: v_writelane_b32 v61, s9, 58
+; SI-NEXT: v_writelane_b32 v61, s79, 59
+; SI-NEXT: v_writelane_b32 v61, s13, 60
+; SI-NEXT: v_writelane_b32 v61, s40, 61
+; SI-NEXT: v_writelane_b32 v61, s42, 62
+; SI-NEXT: v_writelane_b32 v61, s43, 63
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s84, v31
+; SI-NEXT: v_readfirstlane_b32 s78, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s51, v31
+; SI-NEXT: v_readfirstlane_b32 s29, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s86, v31
+; SI-NEXT: v_readfirstlane_b32 s75, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s94, v31
+; SI-NEXT: v_readfirstlane_b32 s77, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s96, v31
+; SI-NEXT: v_readfirstlane_b32 s30, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s68, v31
+; SI-NEXT: v_readfirstlane_b32 s92, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s34, v31
+; SI-NEXT: v_readfirstlane_b32 s35, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
+; SI-NEXT: v_readfirstlane_b32 s39, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s66, v31
+; SI-NEXT: v_readfirstlane_b32 s64, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s78, v31
+; SI-NEXT: v_readfirstlane_b32 s48, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s53, v31
+; SI-NEXT: v_readfirstlane_b32 s52, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s69, v31
+; SI-NEXT: v_readfirstlane_b32 s37, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s30, v31
+; SI-NEXT: v_readfirstlane_b32 s63, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s52, v31
+; SI-NEXT: v_readfirstlane_b32 s34, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s75, v31
+; SI-NEXT: v_readfirstlane_b32 s62, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s23, v31
+; SI-NEXT: v_readfirstlane_b32 s7, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s28, v31
+; SI-NEXT: v_readfirstlane_b32 s72, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s26, v31
+; SI-NEXT: v_readfirstlane_b32 s66, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s25, v31
+; SI-NEXT: v_readfirstlane_b32 s93, v31
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT: v_writelane_b32 v62, s25, 8
-; SI-NEXT: v_writelane_b32 v62, s28, 9
+; SI-NEXT: v_writelane_b32 v62, s93, 9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s92, v31
-; SI-NEXT: v_writelane_b32 v62, s92, 10
-; SI-NEXT: v_writelane_b32 v62, s75, 11
-; SI-NEXT: v_writelane_b32 v62, s26, 12
-; SI-NEXT: v_writelane_b32 v62, s30, 13
-; SI-NEXT: v_writelane_b32 v62, s23, 14
-; SI-NEXT: v_writelane_b32 v62, s52, 15
-; SI-NEXT: v_writelane_b32 v62, s64, 16
-; SI-NEXT: v_writelane_b32 v62, s17, 17
-; SI-NEXT: v_writelane_b32 v62, s65, 18
-; SI-NEXT: v_writelane_b32 v62, s70, 19
-; SI-NEXT: v_writelane_b32 v62, s71, 20
-; SI-NEXT: v_writelane_b32 v62, s49, 21
-; SI-NEXT: v_writelane_b32 v62, s83, 22
-; SI-NEXT: v_writelane_b32 v62, s80, 23
-; SI-NEXT: v_writelane_b32 v62, s82, 24
-; SI-NEXT: v_writelane_b32 v62, s84, 25
-; SI-NEXT: v_writelane_b32 v62, s87, 26
-; SI-NEXT: v_writelane_b32 v62, s86, 27
-; SI-NEXT: v_writelane_b32 v62, s51, 28
-; SI-NEXT: v_writelane_b32 v62, s96, 29
-; SI-NEXT: v_writelane_b32 v62, s34, 30
-; SI-NEXT: v_writelane_b32 v62, s94, 31
-; SI-NEXT: v_writelane_b32 v62, s53, 32
-; SI-NEXT: v_writelane_b32 v62, s66, 33
-; SI-NEXT: v_writelane_b32 v62, s68, 34
-; SI-NEXT: v_writelane_b32 v62, s69, 35
-; SI-NEXT: v_writelane_b32 v62, s77, 36
-; SI-NEXT: v_writelane_b32 v62, s78, 37
-; SI-NEXT: s_cbranch_scc0 .LBB93_4
+; SI-NEXT: v_readfirstlane_b32 s53, v31
+; SI-NEXT: v_writelane_b32 v62, s53, 10
+; SI-NEXT: v_writelane_b32 v62, s66, 11
+; SI-NEXT: v_writelane_b32 v62, s7, 12
+; SI-NEXT: v_writelane_b32 v62, s78, 13
+; SI-NEXT: v_writelane_b32 v62, s77, 14
+; SI-NEXT: v_writelane_b32 v62, s92, 15
+; SI-NEXT: v_writelane_b32 v62, s75, 16
+; SI-NEXT: v_writelane_b32 v62, s37, 17
+; SI-NEXT: v_writelane_b32 v62, s39, 18
+; SI-NEXT: v_writelane_b32 v62, s30, 19
+; SI-NEXT: v_writelane_b32 v62, s48, 20
+; SI-NEXT: v_writelane_b32 v62, s35, 21
+; SI-NEXT: v_writelane_b32 v62, s52, 22
+; SI-NEXT: v_writelane_b32 v62, s64, 23
+; SI-NEXT: s_cbranch_scc0 .LBB93_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_and_b32 s4, s10, 0xff
-; SI-NEXT: s_lshl_b32 s5, s54, 8
+; SI-NEXT: s_lshl_b32 s5, s17, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
; SI-NEXT: s_and_b32 s4, s18, 0xff
-; SI-NEXT: s_lshl_b32 s5, s67, 8
+; SI-NEXT: s_lshl_b32 s5, s19, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: s_and_b32 s4, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s61, 8
+; SI-NEXT: s_lshl_b32 s5, s21, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_and_b32 s4, s22, 0xff
-; SI-NEXT: s_lshl_b32 s5, s35, 8
+; SI-NEXT: s_lshl_b32 s5, s23, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: s_and_b32 s4, s63, 0xff
-; SI-NEXT: s_lshl_b32 s5, s62, 8
+; SI-NEXT: s_and_b32 s4, s68, 0xff
+; SI-NEXT: s_lshl_b32 s5, s25, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_readlane_b32 s5, v61, 2
-; SI-NEXT: s_and_b32 s4, s39, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
+; SI-NEXT: s_and_b32 s4, s26, 0xff
+; SI-NEXT: s_lshl_b32 s5, s76, 8
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
+; SI-NEXT: s_and_b32 s4, s28, 0xff
+; SI-NEXT: s_lshl_b32 s5, s50, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; SI-NEXT: v_readlane_b32 s4, v61, 1
-; SI-NEXT: v_readlane_b32 s5, v61, 0
-; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: s_and_b32 s4, s99, 0xff
; SI-NEXT: s_lshl_b32 s5, s55, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v10, s4
; SI-NEXT: s_and_b32 s4, s95, 0xff
-; SI-NEXT: s_lshl_b32 s5, s93, 8
+; SI-NEXT: s_lshl_b32 s5, s6, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v8, s4
; SI-NEXT: s_and_b32 s4, s31, 0xff
@@ -178253,7 +178246,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s5, s9, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v14, s4
-; SI-NEXT: s_and_b32 s4, s15, 0xff
+; SI-NEXT: s_and_b32 s4, s40, 0xff
; SI-NEXT: s_lshl_b32 s5, s13, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v17, s4
@@ -178261,231 +178254,230 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s5, s42, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v16, s4
-; SI-NEXT: s_and_b32 s4, s45, 0xff
+; SI-NEXT: s_and_b32 s4, s73, 0xff
; SI-NEXT: s_lshl_b32 s5, s44, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v19, s4
-; SI-NEXT: s_and_b32 s4, s6, 0xff
+; SI-NEXT: s_and_b32 s4, s69, 0xff
; SI-NEXT: s_lshl_b32 s5, s74, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v18, s4
; SI-NEXT: s_and_b32 s4, s14, 0xff
-; SI-NEXT: s_lshl_b32 s5, s12, 8
+; SI-NEXT: s_lshl_b32 s5, s11, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v21, s4
; SI-NEXT: s_and_b32 s4, s56, 0xff
; SI-NEXT: s_lshl_b32 s5, s46, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v20, s4
-; SI-NEXT: s_and_b32 s4, s59, 0xff
+; SI-NEXT: s_and_b32 s4, s89, 0xff
; SI-NEXT: s_lshl_b32 s5, s57, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v23, s4
-; SI-NEXT: s_and_b32 s4, s92, 0xff
-; SI-NEXT: s_lshl_b32 s5, s25, 8
+; SI-NEXT: s_and_b32 s4, s53, 0xff
+; SI-NEXT: s_lshl_b32 s5, s93, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v22, s4
-; SI-NEXT: s_and_b32 s4, s26, 0xff
-; SI-NEXT: s_lshl_b32 s5, s28, 8
+; SI-NEXT: s_and_b32 s4, s66, 0xff
+; SI-NEXT: s_lshl_b32 s5, s72, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v25, s4
-; SI-NEXT: s_and_b32 s4, s23, 0xff
-; SI-NEXT: s_lshl_b32 s5, s75, 8
+; SI-NEXT: s_and_b32 s4, s7, 0xff
+; SI-NEXT: s_lshl_b32 s5, s62, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v24, s4
-; SI-NEXT: s_and_b32 s4, s52, 0xff
-; SI-NEXT: s_lshl_b32 s5, s30, 8
+; SI-NEXT: s_and_b32 s4, s34, 0xff
+; SI-NEXT: s_lshl_b32 s5, s63, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v27, s4
-; SI-NEXT: s_and_b32 s4, s69, 0xff
-; SI-NEXT: s_lshl_b32 s5, s53, 8
+; SI-NEXT: s_and_b32 s4, s37, 0xff
+; SI-NEXT: s_lshl_b32 s5, s52, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v26, s4
-; SI-NEXT: s_and_b32 s4, s78, 0xff
-; SI-NEXT: s_lshl_b32 s5, s66, 8
+; SI-NEXT: s_and_b32 s4, s48, 0xff
+; SI-NEXT: s_lshl_b32 s5, s64, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v29, s4
-; SI-NEXT: s_and_b32 s4, s77, 0xff
-; SI-NEXT: s_lshl_b32 s5, s34, 8
+; SI-NEXT: s_and_b32 s4, s39, 0xff
+; SI-NEXT: s_lshl_b32 s5, s35, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v28, s4
-; SI-NEXT: s_and_b32 s4, s68, 0xff
-; SI-NEXT: s_lshl_b32 s5, s96, 8
+; SI-NEXT: s_and_b32 s4, s92, 0xff
+; SI-NEXT: s_lshl_b32 s5, s30, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v31, s4
-; SI-NEXT: s_and_b32 s4, s94, 0xff
-; SI-NEXT: s_lshl_b32 s5, s86, 8
+; SI-NEXT: s_and_b32 s4, s77, 0xff
+; SI-NEXT: s_lshl_b32 s5, s75, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v30, s4
-; SI-NEXT: s_and_b32 s4, s51, 0xff
-; SI-NEXT: s_lshl_b32 s5, s84, 8
+; SI-NEXT: s_and_b32 s4, s29, 0xff
+; SI-NEXT: s_lshl_b32 s5, s78, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s79, v61, 37
+; SI-NEXT: v_readlane_b32 s57, v61, 36
; SI-NEXT: v_cvt_f32_f16_e32 v33, s4
-; SI-NEXT: s_and_b32 s4, s87, 0xff
-; SI-NEXT: s_lshl_b32 s5, s82, 8
+; SI-NEXT: s_and_b32 s4, s79, 0xff
+; SI-NEXT: s_lshl_b32 s5, s57, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s46, v61, 35
+; SI-NEXT: v_readlane_b32 s56, v61, 34
; SI-NEXT: v_cvt_f32_f16_e32 v32, s4
-; SI-NEXT: s_and_b32 s4, s80, 0xff
-; SI-NEXT: s_lshl_b32 s5, s49, 8
+; SI-NEXT: s_and_b32 s4, s46, 0xff
+; SI-NEXT: s_lshl_b32 s5, s56, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s43, v61, 33
+; SI-NEXT: v_readlane_b32 s44, v61, 32
; SI-NEXT: v_cvt_f32_f16_e32 v35, s4
-; SI-NEXT: s_and_b32 s4, s83, 0xff
-; SI-NEXT: s_lshl_b32 s5, s70, 8
+; SI-NEXT: s_and_b32 s4, s43, 0xff
+; SI-NEXT: s_lshl_b32 s5, s44, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s40, v61, 31
+; SI-NEXT: v_readlane_b32 s42, v61, 30
; SI-NEXT: v_cvt_f32_f16_e32 v34, s4
-; SI-NEXT: s_and_b32 s4, s71, 0xff
-; SI-NEXT: s_lshl_b32 s5, s65, 8
+; SI-NEXT: s_and_b32 s4, s40, 0xff
+; SI-NEXT: s_lshl_b32 s5, s42, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s22, v61, 29
+; SI-NEXT: v_readlane_b32 s26, v61, 28
; SI-NEXT: v_cvt_f32_f16_e32 v37, s4
-; SI-NEXT: s_and_b32 s4, s17, 0xff
-; SI-NEXT: s_lshl_b32 s5, s64, 8
+; SI-NEXT: s_and_b32 s4, s22, 0xff
+; SI-NEXT: s_lshl_b32 s5, s26, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s28, v61, 27
+; SI-NEXT: v_readlane_b32 s18, v61, 26
; SI-NEXT: v_cvt_f32_f16_e32 v36, s4
-; SI-NEXT: s_and_b32 s4, s19, 0xff
-; SI-NEXT: s_lshl_b32 s5, s48, 8
+; SI-NEXT: s_and_b32 s4, s28, 0xff
+; SI-NEXT: s_lshl_b32 s5, s18, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s14, v61, 25
+; SI-NEXT: v_readlane_b32 s13, v61, 24
; SI-NEXT: v_cvt_f32_f16_e32 v39, s4
-; SI-NEXT: s_and_b32 s4, s50, 0xff
-; SI-NEXT: s_lshl_b32 s5, s37, 8
+; SI-NEXT: s_and_b32 s4, s14, 0xff
+; SI-NEXT: s_lshl_b32 s5, s13, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s8, v61, 36
+; SI-NEXT: v_readlane_b32 s11, v61, 23
; SI-NEXT: v_cvt_f32_f16_e32 v38, s4
-; SI-NEXT: s_and_b32 s4, s8, 0xff
-; SI-NEXT: s_lshl_b32 s5, s29, 8
+; SI-NEXT: s_and_b32 s4, s59, 0xff
+; SI-NEXT: s_lshl_b32 s5, s11, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v49, s4
-; SI-NEXT: s_and_b32 s4, s76, 0xff
+; SI-NEXT: s_and_b32 s4, s47, 0xff
; SI-NEXT: s_lshl_b32 s5, s58, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v48, s4
-; SI-NEXT: s_and_b32 s4, s47, 0xff
+; SI-NEXT: s_and_b32 s4, s12, 0xff
; SI-NEXT: s_lshl_b32 s5, s41, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s25, v61, 22
+; SI-NEXT: v_readlane_b32 s10, v61, 21
; SI-NEXT: v_cvt_f32_f16_e32 v51, s4
-; SI-NEXT: s_and_b32 s4, s11, 0xff
-; SI-NEXT: s_lshl_b32 s5, s7, 8
+; SI-NEXT: s_and_b32 s4, s25, 0xff
+; SI-NEXT: s_lshl_b32 s5, s10, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s21, v61, 20
; SI-NEXT: v_cvt_f32_f16_e32 v50, s4
-; SI-NEXT: s_and_b32 s4, s97, 0xff
-; SI-NEXT: s_lshl_b32 s5, s81, 8
+; SI-NEXT: s_and_b32 s4, s21, 0xff
+; SI-NEXT: s_lshl_b32 s5, s97, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_cvt_f32_f16_e32 v53, s4
-; SI-NEXT: s_and_b32 s4, s85, 0xff
-; SI-NEXT: s_lshl_b32 s5, s21, 8
+; SI-NEXT: s_and_b32 s4, s81, 0xff
+; SI-NEXT: s_lshl_b32 s5, s85, 8
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s9, v61, 19
+; SI-NEXT: v_readlane_b32 s7, v61, 18
; SI-NEXT: v_cvt_f32_f16_e32 v52, s4
-; SI-NEXT: s_and_b32 s4, s40, 0xff
-; SI-NEXT: s_lshl_b32 s5, s72, 8
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s7, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s69, v61, 35
+; SI-NEXT: v_readlane_b32 s68, v61, 17
; SI-NEXT: v_cvt_f32_f16_e32 v55, s4
-; SI-NEXT: s_and_b32 s4, s69, 0xff
-; SI-NEXT: s_lshl_b32 s5, s73, 8
+; SI-NEXT: s_and_b32 s4, s68, 0xff
+; SI-NEXT: s_lshl_b32 s5, s45, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s68, v61, 34
+; SI-NEXT: v_readlane_b32 s66, v61, 16
; SI-NEXT: v_cvt_f32_f16_e32 v54, s4
-; SI-NEXT: s_and_b32 s4, s68, 0xff
-; SI-NEXT: s_lshl_b32 s5, s89, 8
+; SI-NEXT: s_and_b32 s4, s66, 0xff
+; SI-NEXT: s_lshl_b32 s5, s15, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s66, v61, 33
+; SI-NEXT: v_readlane_b32 s53, v61, 15
; SI-NEXT: v_cvt_f32_f16_e32 v41, s4
-; SI-NEXT: s_and_b32 s4, s66, 0xff
+; SI-NEXT: s_and_b32 s4, s53, 0xff
; SI-NEXT: s_lshl_b32 s5, s16, 8
+; SI-NEXT: s_mov_b32 s61, s34
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s53, v61, 32
-; SI-NEXT: v_readlane_b32 s94, v61, 31
+; SI-NEXT: v_readlane_b32 s34, v61, 14
; SI-NEXT: v_cvt_f32_f16_e32 v40, s4
-; SI-NEXT: s_and_b32 s4, s53, 0xff
+; SI-NEXT: s_and_b32 s4, s34, 0xff
; SI-NEXT: s_lshl_b32 s5, s94, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s34, v61, 30
-; SI-NEXT: v_readlane_b32 s96, v61, 29
+; SI-NEXT: v_readlane_b32 s93, v61, 13
; SI-NEXT: v_cvt_f32_f16_e32 v43, s4
-; SI-NEXT: s_and_b32 s4, s34, 0xff
+; SI-NEXT: s_and_b32 s4, s93, 0xff
; SI-NEXT: s_lshl_b32 s5, s96, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s51, v61, 28
-; SI-NEXT: v_readlane_b32 s86, v61, 27
; SI-NEXT: v_cvt_f32_f16_e32 v42, s4
; SI-NEXT: s_and_b32 s4, s51, 0xff
; SI-NEXT: s_lshl_b32 s5, s86, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s87, v61, 26
-; SI-NEXT: v_readlane_b32 s84, v61, 25
; SI-NEXT: v_cvt_f32_f16_e32 v45, s4
; SI-NEXT: s_and_b32 s4, s87, 0xff
; SI-NEXT: s_lshl_b32 s5, s84, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s82, v61, 24
-; SI-NEXT: v_readlane_b32 s80, v61, 23
; SI-NEXT: v_cvt_f32_f16_e32 v44, s4
; SI-NEXT: s_and_b32 s4, s82, 0xff
; SI-NEXT: s_lshl_b32 s5, s80, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s83, v61, 22
-; SI-NEXT: v_readlane_b32 s49, v61, 21
; SI-NEXT: v_cvt_f32_f16_e32 v47, s4
; SI-NEXT: s_and_b32 s4, s83, 0xff
; SI-NEXT: s_lshl_b32 s5, s49, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s71, v61, 20
-; SI-NEXT: v_readlane_b32 s70, v61, 19
; SI-NEXT: v_cvt_f32_f16_e32 v46, s4
; SI-NEXT: s_and_b32 s4, s71, 0xff
; SI-NEXT: s_lshl_b32 s5, s70, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s65, v61, 18
-; SI-NEXT: v_readlane_b32 s54, v61, 17
; SI-NEXT: v_cvt_f32_f16_e32 v57, s4
; SI-NEXT: s_and_b32 s4, s65, 0xff
; SI-NEXT: s_lshl_b32 s5, s54, 8
-; SI-NEXT: s_mov_b32 s17, s19
-; SI-NEXT: s_mov_b32 s19, s50
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s67, v61, 16
-; SI-NEXT: v_readlane_b32 s50, v61, 15
+; SI-NEXT: v_readlane_b32 s50, v61, 12
; SI-NEXT: v_cvt_f32_f16_e32 v56, s4
; SI-NEXT: s_and_b32 s4, s67, 0xff
; SI-NEXT: s_lshl_b32 s5, s50, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s64, v61, 14
-; SI-NEXT: v_readlane_b32 s52, v61, 13
+; SI-NEXT: v_readlane_b32 s64, v61, 11
+; SI-NEXT: v_readlane_b32 s52, v61, 10
; SI-NEXT: v_cvt_f32_f16_e32 v59, s4
; SI-NEXT: s_and_b32 s4, s64, 0xff
; SI-NEXT: s_lshl_b32 s5, s52, 8
-; SI-NEXT: s_mov_b32 s23, s48
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s35, v61, 12
-; SI-NEXT: v_readlane_b32 s48, v61, 11
+; SI-NEXT: v_readlane_b32 s35, v61, 9
+; SI-NEXT: v_readlane_b32 s48, v61, 8
; SI-NEXT: v_cvt_f32_f16_e32 v58, s4
; SI-NEXT: s_and_b32 s4, s35, 0xff
; SI-NEXT: s_lshl_b32 s5, s48, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s30, v61, 10
-; SI-NEXT: v_readlane_b32 s39, v61, 9
+; SI-NEXT: v_readlane_b32 s30, v61, 7
+; SI-NEXT: v_readlane_b32 s39, v61, 6
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_and_b32 s4, s30, 0xff
; SI-NEXT: s_lshl_b32 s5, s39, 8
-; SI-NEXT: s_mov_b32 s26, s37
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s37, v61, 8
-; SI-NEXT: v_readlane_b32 s75, v61, 7
+; SI-NEXT: v_readlane_b32 s37, v61, 5
+; SI-NEXT: v_readlane_b32 s75, v61, 4
; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
; SI-NEXT: s_and_b32 s4, s37, 0xff
; SI-NEXT: s_lshl_b32 s5, s75, 8
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s92, v61, 6
-; SI-NEXT: v_readlane_b32 s77, v61, 5
+; SI-NEXT: v_readlane_b32 s92, v61, 3
+; SI-NEXT: v_readlane_b32 s77, v61, 2
; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: s_and_b32 s4, s92, 0xff
; SI-NEXT: s_lshl_b32 s5, s77, 8
-; SI-NEXT: s_mov_b32 s28, s29
-; SI-NEXT: s_mov_b32 s29, s76
; SI-NEXT: s_or_b32 s4, s4, s5
-; SI-NEXT: v_readlane_b32 s78, v61, 4
-; SI-NEXT: v_readlane_b32 s76, v61, 3
+; SI-NEXT: v_readlane_b32 s78, v61, 1
+; SI-NEXT: v_readlane_b32 s76, v61, 0
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: s_and_b32 s4, s78, 0xff
; SI-NEXT: s_lshl_b32 s5, s76, 8
@@ -178493,24 +178485,151 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_mov_b32 s99, s55
; SI-NEXT: s_mov_b32 s20, s88
; SI-NEXT: s_mov_b32 s24, s98
-; SI-NEXT: s_mov_b32 s59, s58
-; SI-NEXT: s_mov_b32 s56, s47
-; SI-NEXT: s_mov_b32 s46, s41
-; SI-NEXT: s_mov_b32 s12, s11
-; SI-NEXT: s_mov_b32 s11, s7
-; SI-NEXT: s_mov_b32 s7, s97
-; SI-NEXT: s_mov_b32 s97, s81
-; SI-NEXT: s_mov_b32 s81, s85
-; SI-NEXT: s_mov_b32 s6, s40
-; SI-NEXT: s_mov_b32 s40, s72
-; SI-NEXT: s_mov_b32 s45, s73
-; SI-NEXT: s_mov_b32 s15, s89
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_mov_b32 s55, s93
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_mov_b32 s55, s6
; SI-NEXT: s_mov_b32 s95, s91
; SI-NEXT: s_mov_b32 s31, s90
-; SI-NEXT: s_cbranch_execnz .LBB93_3
-; SI-NEXT: .LBB93_2: ; %cmp.true
+; SI-NEXT: s_mov_b32 s36, s8
+; SI-NEXT: s_mov_b32 s38, s27
+; SI-NEXT: s_mov_b32 s6, s7
+; SI-NEXT: s_mov_b32 s8, s9
+; SI-NEXT: s_mov_b32 s7, s10
+; SI-NEXT: s_mov_b32 s98, s89
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s17, s72
+; SI-NEXT: s_mov_b32 s19, s62
+; SI-NEXT: s_mov_b32 s11, s13
+; SI-NEXT: s_mov_b32 s23, s63
+; SI-NEXT: s_mov_b32 s13, s18
+; SI-NEXT: s_mov_b32 s27, s22
+; SI-NEXT: s_branch .LBB93_3
+; SI-NEXT: .LBB93_2:
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: s_mov_b32 s61, s34
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: s_mov_b32 s24, s98
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: s_mov_b32 s20, s88
+; SI-NEXT: s_mov_b32 s99, s55
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_readlane_b32 s75, v61, 4
+; SI-NEXT: v_readlane_b32 s76, v61, 0
+; SI-NEXT: v_readlane_b32 s77, v61, 2
+; SI-NEXT: v_readlane_b32 s78, v61, 1
+; SI-NEXT: v_readlane_b32 s92, v61, 3
+; SI-NEXT: v_readlane_b32 s39, v61, 6
+; SI-NEXT: v_readlane_b32 s37, v61, 5
+; SI-NEXT: v_readlane_b32 s30, v61, 7
+; SI-NEXT: v_readlane_b32 s48, v61, 8
+; SI-NEXT: v_readlane_b32 s52, v61, 10
+; SI-NEXT: v_readlane_b32 s35, v61, 9
+; SI-NEXT: v_readlane_b32 s50, v61, 12
+; SI-NEXT: v_readlane_b32 s64, v61, 11
+; SI-NEXT: s_mov_b32 s55, s6
+; SI-NEXT: s_mov_b32 s95, s91
+; SI-NEXT: s_mov_b32 s31, s90
+; SI-NEXT: s_mov_b32 s36, s8
+; SI-NEXT: s_mov_b32 s38, s27
+; SI-NEXT: v_readlane_b32 s6, v61, 18
+; SI-NEXT: v_readlane_b32 s93, v61, 13
+; SI-NEXT: v_readlane_b32 s34, v61, 14
+; SI-NEXT: v_readlane_b32 s53, v61, 15
+; SI-NEXT: v_readlane_b32 s66, v61, 16
+; SI-NEXT: v_readlane_b32 s68, v61, 17
+; SI-NEXT: v_readlane_b32 s8, v61, 19
+; SI-NEXT: v_readlane_b32 s21, v61, 20
+; SI-NEXT: v_readlane_b32 s25, v61, 22
+; SI-NEXT: v_readlane_b32 s7, v61, 21
+; SI-NEXT: s_mov_b32 s98, s89
+; SI-NEXT: v_readlane_b32 s9, v61, 23
+; SI-NEXT: s_mov_b32 s17, s72
+; SI-NEXT: s_mov_b32 s19, s62
+; SI-NEXT: v_readlane_b32 s11, v61, 24
+; SI-NEXT: s_mov_b32 s23, s63
+; SI-NEXT: v_readlane_b32 s13, v61, 26
+; SI-NEXT: v_readlane_b32 s14, v61, 25
+; SI-NEXT: v_readlane_b32 s26, v61, 28
+; SI-NEXT: v_readlane_b32 s27, v61, 29
+; SI-NEXT: v_readlane_b32 s28, v61, 27
+; SI-NEXT: v_readlane_b32 s40, v61, 31
+; SI-NEXT: v_readlane_b32 s42, v61, 30
+; SI-NEXT: v_readlane_b32 s43, v61, 33
+; SI-NEXT: v_readlane_b32 s44, v61, 32
+; SI-NEXT: v_readlane_b32 s46, v61, 35
+; SI-NEXT: v_readlane_b32 s56, v61, 34
+; SI-NEXT: v_readlane_b32 s57, v61, 36
+; SI-NEXT: v_readlane_b32 s79, v61, 37
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr17
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr19
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr21
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr23
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr25
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr29
+; SI-NEXT: ; implicit-def: $vgpr28
+; SI-NEXT: ; implicit-def: $vgpr31
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr32
+; SI-NEXT: ; implicit-def: $vgpr35
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr37
+; SI-NEXT: ; implicit-def: $vgpr36
+; SI-NEXT: ; implicit-def: $vgpr39
+; SI-NEXT: ; implicit-def: $vgpr38
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr50
+; SI-NEXT: ; implicit-def: $vgpr53
+; SI-NEXT: ; implicit-def: $vgpr52
+; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr41
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr43
+; SI-NEXT: ; implicit-def: $vgpr42
+; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $vgpr46
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $vgpr59
+; SI-NEXT: ; implicit-def: $vgpr58
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr60
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: .LBB93_3: ; %Flow
+; SI-NEXT: s_mov_b32 s88, s29
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB93_5
+; SI-NEXT: ; %bb.4: ; %cmp.true
; SI-NEXT: s_add_i32 s4, s78, 3
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s76, 8
@@ -178528,6 +178647,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s60, s39, 8
; SI-NEXT: s_or_b32 s60, s60, vcc_hi
; SI-NEXT: s_add_i32 vcc_hi, s35, 3
+; SI-NEXT: s_mov_b32 s90, s20
+; SI-NEXT: s_mov_b32 s20, s61
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s61, s48, 8
; SI-NEXT: s_or_b32 s61, s61, vcc_hi
@@ -178560,228 +178681,209 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_lshl_b32 s76, s84, 8
; SI-NEXT: s_or_b32 s76, s76, vcc_hi
; SI-NEXT: s_add_i32 vcc_hi, s51, 3
-; SI-NEXT: s_add_i32 s93, s53, 3
+; SI-NEXT: s_add_i32 s89, s93, 3
+; SI-NEXT: s_add_i32 s93, s34, 3
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s77, s86, 8
-; SI-NEXT: s_add_i32 s89, s34, 3
; SI-NEXT: s_and_b32 s93, s93, 0xff
; SI-NEXT: s_lshl_b32 s78, s94, 8
-; SI-NEXT: s_add_i32 s34, s66, 3
+; SI-NEXT: s_add_i32 s34, s53, 3
; SI-NEXT: s_or_b32 s77, s77, vcc_hi
; SI-NEXT: s_and_b32 s89, s89, 0xff
; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8
; SI-NEXT: s_or_b32 s22, s78, s93
; SI-NEXT: s_and_b32 s93, s34, 0xff
; SI-NEXT: s_lshl_b32 s92, s16, 8
-; SI-NEXT: s_add_i32 s53, s68, 3
+; SI-NEXT: s_add_i32 s53, s66, 3
; SI-NEXT: s_or_b32 s89, vcc_hi, s89
; SI-NEXT: s_or_b32 s92, s92, s93
; SI-NEXT: s_and_b32 s93, s53, 0xff
; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8
-; SI-NEXT: s_add_i32 s66, s69, 3
+; SI-NEXT: s_add_i32 s66, s68, 3
; SI-NEXT: s_or_b32 s93, vcc_hi, s93
; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff
; SI-NEXT: s_lshl_b32 s34, s45, 8
-; SI-NEXT: s_add_i32 s68, s6, 3
+; SI-NEXT: s_add_i32 s68, s8, 3
; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi
; SI-NEXT: s_and_b32 s34, s68, 0xff
-; SI-NEXT: s_lshl_b32 s39, s40, 8
+; SI-NEXT: s_lshl_b32 s39, s6, 8
; SI-NEXT: s_add_i32 s69, s81, 3
; SI-NEXT: s_or_b32 s34, s39, s34
; SI-NEXT: s_and_b32 s39, s69, 0xff
-; SI-NEXT: s_lshl_b32 s52, s21, 8
-; SI-NEXT: s_add_i32 s81, s7, 3
+; SI-NEXT: s_lshl_b32 s52, s85, 8
+; SI-NEXT: s_add_i32 s81, s21, 3
; SI-NEXT: s_or_b32 s39, s52, s39
; SI-NEXT: s_and_b32 s52, s81, 0xff
; SI-NEXT: s_lshl_b32 s53, s97, 8
-; SI-NEXT: s_add_i32 s85, s12, 3
+; SI-NEXT: s_add_i32 s85, s25, 3
; SI-NEXT: s_or_b32 s52, s53, s52
; SI-NEXT: s_and_b32 s53, s85, 0xff
-; SI-NEXT: s_lshl_b32 s64, s11, 8
-; SI-NEXT: s_add_i32 s97, s56, 3
+; SI-NEXT: s_lshl_b32 s64, s7, 8
+; SI-NEXT: s_add_i32 s97, s12, 3
; SI-NEXT: s_or_b32 s53, s64, s53
; SI-NEXT: s_and_b32 s64, s97, 0xff
-; SI-NEXT: s_lshl_b32 s66, s46, 8
-; SI-NEXT: s_add_i32 s21, s29, 3
+; SI-NEXT: s_lshl_b32 s66, s41, 8
+; SI-NEXT: s_add_i32 s21, s47, 3
+; SI-NEXT: v_readlane_b32 s16, v62, 12
+; SI-NEXT: s_mov_b32 s91, s24
; SI-NEXT: s_or_b32 s64, s66, s64
; SI-NEXT: s_and_b32 s21, s21, 0xff
-; SI-NEXT: s_lshl_b32 s66, s59, 8
-; SI-NEXT: s_add_i32 s25, s8, 3
+; SI-NEXT: s_lshl_b32 s66, s58, 8
+; SI-NEXT: s_add_i32 s25, s59, 3
+; SI-NEXT: s_add_i32 s24, s16, 3
+; SI-NEXT: v_readlane_b32 s16, v62, 11
; SI-NEXT: s_or_b32 s66, s66, s21
; SI-NEXT: s_and_b32 s21, s25, 0xff
-; SI-NEXT: s_lshl_b32 s6, s28, 8
-; SI-NEXT: s_add_i32 s29, s19, 3
+; SI-NEXT: s_lshl_b32 s6, s9, 8
+; SI-NEXT: s_add_i32 s29, s14, 3
+; SI-NEXT: s_add_i32 s7, s27, 3
+; SI-NEXT: s_add_i32 s27, s20, 3
+; SI-NEXT: s_add_i32 s20, s16, 3
; SI-NEXT: s_or_b32 s67, s6, s21
; SI-NEXT: s_and_b32 s6, s29, 0xff
-; SI-NEXT: s_lshl_b32 s18, s26, 8
-; SI-NEXT: s_add_i32 s28, s17, 3
-; SI-NEXT: s_or_b32 s68, s18, s6
-; SI-NEXT: s_and_b32 s6, s28, 0xff
-; SI-NEXT: s_lshl_b32 s18, s23, 8
-; SI-NEXT: s_or_b32 s69, s18, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 17
-; SI-NEXT: s_add_i32 s7, s6, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 15
-; SI-NEXT: s_and_b32 s6, s7, 0xff
-; SI-NEXT: v_readlane_b32 s7, v62, 16
-; SI-NEXT: s_add_i32 s27, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 13
-; SI-NEXT: s_lshl_b32 s7, s7, 8
-; SI-NEXT: s_lshl_b32 s23, s16, 8
-; SI-NEXT: v_readlane_b32 s16, v62, 14
-; SI-NEXT: s_mov_b32 s91, s24
-; SI-NEXT: s_or_b32 s70, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 20
-; SI-NEXT: s_add_i32 s24, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 11
-; SI-NEXT: s_add_i32 s11, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 18
-; SI-NEXT: s_lshl_b32 s19, s16, 8
-; SI-NEXT: v_readlane_b32 s16, v62, 12
-; SI-NEXT: s_mov_b32 s90, s20
-; SI-NEXT: s_and_b32 s6, s11, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
-; SI-NEXT: s_add_i32 s20, s16, 3
-; SI-NEXT: v_readlane_b32 s16, v62, 9
-; SI-NEXT: s_or_b32 s71, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 22
+; SI-NEXT: s_lshl_b32 s18, s11, 8
+; SI-NEXT: s_add_i32 s28, s28, 3
; SI-NEXT: s_and_b32 s20, s20, 0xff
-; SI-NEXT: s_lshl_b32 s17, s16, 8
+; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: v_readlane_b32 s16, v62, 10
-; SI-NEXT: s_add_i32 s12, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 19
+; SI-NEXT: s_or_b32 s68, s18, s6
+; SI-NEXT: s_and_b32 s6, s28, 0xff
+; SI-NEXT: s_lshl_b32 s18, s13, 8
+; SI-NEXT: s_and_b32 s24, s24, 0xff
+; SI-NEXT: s_lshl_b32 s19, s19, 8
; SI-NEXT: s_or_b32 s17, s17, s20
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s20, v62, 8
-; SI-NEXT: s_and_b32 s6, s12, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: v_readlane_b32 s20, v62, 9
+; SI-NEXT: s_or_b32 s69, s18, s6
+; SI-NEXT: s_and_b32 s6, s7, 0xff
+; SI-NEXT: s_lshl_b32 s7, s26, 8
+; SI-NEXT: s_add_i32 s11, s40, 3
+; SI-NEXT: s_or_b32 s19, s19, s24
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s20, s20, 8
-; SI-NEXT: s_or_b32 s81, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 23
-; SI-NEXT: s_and_b32 s24, s24, 0xff
+; SI-NEXT: s_add_i32 s98, s98, 3
+; SI-NEXT: v_readlane_b32 s24, v62, 8
+; SI-NEXT: s_or_b32 s70, s7, s6
+; SI-NEXT: s_and_b32 s6, s11, 0xff
+; SI-NEXT: s_lshl_b32 s7, s42, 8
+; SI-NEXT: s_add_i32 s12, s43, 3
; SI-NEXT: s_or_b32 s16, s20, s16
-; SI-NEXT: v_readlane_b32 s20, v62, 7
-; SI-NEXT: s_add_i32 s14, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 21
-; SI-NEXT: s_or_b32 s19, s19, s24
-; SI-NEXT: s_add_i32 s98, s20, 3
-; SI-NEXT: v_readlane_b32 s24, v62, 6
-; SI-NEXT: s_and_b32 s6, s14, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_and_b32 s20, s98, 0xff
; SI-NEXT: s_lshl_b32 s24, s24, 8
-; SI-NEXT: s_or_b32 s83, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 26
+; SI-NEXT: s_or_b32 s71, s7, s6
+; SI-NEXT: s_and_b32 s6, s12, 0xff
+; SI-NEXT: s_lshl_b32 s7, s44, 8
+; SI-NEXT: s_add_i32 s14, s46, 3
; SI-NEXT: s_and_b32 s27, s27, 0xff
+; SI-NEXT: s_lshl_b32 s23, s23, 8
; SI-NEXT: s_or_b32 s20, s24, s20
-; SI-NEXT: v_readlane_b32 s24, v62, 5
-; SI-NEXT: s_add_i32 s41, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 24
+; SI-NEXT: v_readlane_b32 s24, v62, 7
+; SI-NEXT: s_or_b32 s81, s7, s6
+; SI-NEXT: s_and_b32 s6, s14, 0xff
+; SI-NEXT: s_lshl_b32 s7, s56, 8
+; SI-NEXT: s_add_i32 s41, s79, 3
; SI-NEXT: s_or_b32 s23, s23, s27
; SI-NEXT: s_add_i32 s86, s24, 3
-; SI-NEXT: v_readlane_b32 s27, v62, 4
+; SI-NEXT: v_readlane_b32 s27, v62, 6
+; SI-NEXT: s_or_b32 s83, s7, s6
; SI-NEXT: s_and_b32 s6, s41, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_lshl_b32 s7, s57, 8
; SI-NEXT: s_and_b32 s24, s86, 0xff
; SI-NEXT: s_lshl_b32 s27, s27, 8
; SI-NEXT: s_or_b32 s85, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 28
+; SI-NEXT: s_add_i32 s46, s88, 3
+; SI-NEXT: v_readlane_b32 s7, v62, 13
; SI-NEXT: s_or_b32 s24, s27, s24
-; SI-NEXT: v_readlane_b32 s27, v62, 3
-; SI-NEXT: s_add_i32 s46, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 25
-; SI-NEXT: s_add_i32 s12, s73, 0x300
-; SI-NEXT: s_add_i32 s82, s27, 3
-; SI-NEXT: v_readlane_b32 s73, v62, 2
+; SI-NEXT: v_readlane_b32 s27, v62, 5
; SI-NEXT: s_and_b32 s6, s46, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s12, s73, 0x300
+; SI-NEXT: s_add_i32 s82, s27, 3
+; SI-NEXT: v_readlane_b32 s73, v62, 4
+; SI-NEXT: s_or_b32 s96, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 14
; SI-NEXT: s_and_b32 s27, s82, 0xff
; SI-NEXT: s_lshl_b32 s73, s73, 8
-; SI-NEXT: s_or_b32 s96, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 31
-; SI-NEXT: s_or_b32 s27, s73, s27
-; SI-NEXT: v_readlane_b32 s73, v62, 1
; SI-NEXT: s_add_i32 s47, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 27
-; SI-NEXT: s_add_i32 s13, s74, 0x300
-; SI-NEXT: s_add_i32 s65, s73, 3
-; SI-NEXT: v_readlane_b32 s74, v62, 0
+; SI-NEXT: v_readlane_b32 s7, v62, 16
+; SI-NEXT: s_or_b32 s27, s73, s27
+; SI-NEXT: v_readlane_b32 s73, v62, 3
; SI-NEXT: s_and_b32 s6, s47, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s13, s74, 0x300
+; SI-NEXT: s_add_i32 s65, s73, 3
+; SI-NEXT: v_readlane_b32 s74, v62, 2
+; SI-NEXT: s_or_b32 s97, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 15
; SI-NEXT: s_and_b32 s73, s65, 0xff
; SI-NEXT: s_lshl_b32 s74, s74, 8
-; SI-NEXT: s_or_b32 s97, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 34
-; SI-NEXT: s_or_b32 s73, s74, s73
-; SI-NEXT: v_readlane_b32 s74, v61, 63
; SI-NEXT: s_add_i32 s56, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 29
-; SI-NEXT: s_add_i32 s14, s75, 0x300
-; SI-NEXT: s_add_i32 s54, s74, 3
-; SI-NEXT: v_readlane_b32 s75, v61, 62
+; SI-NEXT: v_readlane_b32 s7, v62, 19
+; SI-NEXT: s_or_b32 s73, s74, s73
+; SI-NEXT: v_readlane_b32 s74, v62, 1
; SI-NEXT: s_and_b32 s6, s56, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s14, s75, 0x300
+; SI-NEXT: s_add_i32 s54, s74, 3
+; SI-NEXT: v_readlane_b32 s75, v62, 0
+; SI-NEXT: s_or_b32 s63, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 18
; SI-NEXT: s_and_b32 s74, s54, 0xff
; SI-NEXT: s_lshl_b32 s75, s75, 8
-; SI-NEXT: s_or_b32 s63, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 36
-; SI-NEXT: s_or_b32 s74, s75, s74
-; SI-NEXT: v_readlane_b32 s75, v61, 61
; SI-NEXT: s_add_i32 s58, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 30
-; SI-NEXT: s_add_i32 s15, s76, 0x300
-; SI-NEXT: s_add_i32 s50, s75, 3
-; SI-NEXT: v_readlane_b32 s76, v61, 60
+; SI-NEXT: v_readlane_b32 s7, v62, 21
+; SI-NEXT: s_or_b32 s74, s75, s74
+; SI-NEXT: v_readlane_b32 s75, v61, 63
; SI-NEXT: s_and_b32 s6, s58, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s15, s76, 0x300
+; SI-NEXT: s_add_i32 s50, s75, 3
+; SI-NEXT: v_readlane_b32 s76, v61, 62
+; SI-NEXT: s_or_b32 s79, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 20
; SI-NEXT: s_and_b32 s75, s50, 0xff
; SI-NEXT: s_lshl_b32 s76, s76, 8
-; SI-NEXT: s_or_b32 s79, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 37
-; SI-NEXT: s_or_b32 s75, s76, s75
-; SI-NEXT: v_readlane_b32 s76, v61, 59
; SI-NEXT: s_add_i32 s59, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 33
-; SI-NEXT: s_add_i32 s18, s77, 0x300
-; SI-NEXT: s_add_i32 s48, s76, 3
-; SI-NEXT: v_readlane_b32 s77, v61, 58
+; SI-NEXT: v_readlane_b32 s7, v62, 23
+; SI-NEXT: s_or_b32 s75, s76, s75
+; SI-NEXT: v_readlane_b32 s76, v61, 61
; SI-NEXT: s_and_b32 s6, s59, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: s_add_i32 s18, s77, 0x300
+; SI-NEXT: s_add_i32 s48, s76, 3
+; SI-NEXT: v_readlane_b32 s77, v61, 60
+; SI-NEXT: s_or_b32 s78, s7, s6
+; SI-NEXT: v_readlane_b32 s6, v62, 17
; SI-NEXT: s_and_b32 s76, s48, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 8
-; SI-NEXT: s_or_b32 s78, s7, s6
-; SI-NEXT: v_readlane_b32 s6, v62, 35
-; SI-NEXT: s_or_b32 s76, s77, s76
-; SI-NEXT: v_readlane_b32 s77, v61, 57
; SI-NEXT: s_add_i32 s57, s6, 3
-; SI-NEXT: v_readlane_b32 s7, v62, 32
+; SI-NEXT: v_readlane_b32 s7, v62, 22
+; SI-NEXT: s_or_b32 s76, s77, s76
+; SI-NEXT: v_readlane_b32 s77, v61, 59
+; SI-NEXT: s_and_b32 s6, s57, 0xff
+; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_add_i32 s11, s72, 0x300
; SI-NEXT: s_add_i32 s72, s79, 0x300
; SI-NEXT: s_add_i32 s37, s77, 3
-; SI-NEXT: v_readlane_b32 s79, v61, 56
-; SI-NEXT: s_and_b32 s6, s57, 0xff
-; SI-NEXT: s_lshl_b32 s7, s7, 8
+; SI-NEXT: v_readlane_b32 s79, v61, 58
+; SI-NEXT: s_or_b32 s88, s7, s6
; SI-NEXT: s_and_b32 s77, s37, 0xff
; SI-NEXT: s_lshl_b32 s79, s79, 8
-; SI-NEXT: s_or_b32 s88, s7, s6
-; SI-NEXT: s_or_b32 s77, s79, s77
-; SI-NEXT: v_readlane_b32 s79, v61, 55
+; SI-NEXT: s_add_i32 s35, s38, 3
; SI-NEXT: s_add_i32 s21, s89, 0x300
; SI-NEXT: s_add_i32 s89, s88, 0x300
-; SI-NEXT: s_add_i32 s35, s79, 3
-; SI-NEXT: v_readlane_b32 s88, v61, 54
+; SI-NEXT: s_or_b32 s77, s79, s77
; SI-NEXT: s_and_b32 s79, s35, 0xff
-; SI-NEXT: s_lshl_b32 s88, s88, 8
+; SI-NEXT: s_lshl_b32 s88, s36, 8
; SI-NEXT: s_or_b32 s79, s88, s79
-; SI-NEXT: v_readlane_b32 s88, v61, 53
+; SI-NEXT: v_readlane_b32 s88, v61, 57
; SI-NEXT: s_add_i32 s25, s92, 0x300
; SI-NEXT: s_add_i32 s30, s88, 3
-; SI-NEXT: v_readlane_b32 s92, v61, 52
+; SI-NEXT: v_readlane_b32 s92, v61, 56
; SI-NEXT: s_and_b32 s88, s30, 0xff
; SI-NEXT: s_lshl_b32 s92, s92, 8
; SI-NEXT: s_or_b32 s88, s92, s88
-; SI-NEXT: v_readlane_b32 s92, v61, 51
+; SI-NEXT: v_readlane_b32 s92, v61, 55
; SI-NEXT: s_add_i32 s94, s92, 3
; SI-NEXT: s_and_b32 s92, s94, 0xff
; SI-NEXT: s_lshl_b32 s91, s91, 8
@@ -178790,52 +178892,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_and_b32 s90, s90, 0xff
; SI-NEXT: s_lshl_b32 s92, s31, 8
; SI-NEXT: s_or_b32 s90, s92, s90
-; SI-NEXT: v_readlane_b32 s92, v61, 50
+; SI-NEXT: v_readlane_b32 s92, v61, 54
; SI-NEXT: s_add_i32 s92, s92, 3
; SI-NEXT: s_add_i32 s26, s93, 0x300
; SI-NEXT: s_and_b32 s92, s92, 0xff
; SI-NEXT: s_lshl_b32 s93, s95, 8
; SI-NEXT: s_or_b32 s92, s93, s92
-; SI-NEXT: v_readlane_b32 s93, v61, 49
+; SI-NEXT: v_readlane_b32 s93, v61, 53
; SI-NEXT: s_add_i32 s93, s93, 3
; SI-NEXT: s_and_b32 s93, s93, 0xff
; SI-NEXT: s_lshl_b32 s94, s55, 8
; SI-NEXT: s_or_b32 s93, s94, s93
-; SI-NEXT: v_readlane_b32 s94, v61, 48
+; SI-NEXT: v_readlane_b32 s94, v61, 51
; SI-NEXT: s_add_i32 s94, s94, 3
; SI-NEXT: s_and_b32 s94, s94, 0xff
; SI-NEXT: s_lshl_b32 s95, s99, 8
; SI-NEXT: s_or_b32 s94, s95, s94
-; SI-NEXT: v_readlane_b32 s95, v61, 1
+; SI-NEXT: v_readlane_b32 s95, v61, 52
; SI-NEXT: s_add_i32 s95, s95, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 0
+; SI-NEXT: v_readlane_b32 s30, v61, 50
; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300
; SI-NEXT: s_and_b32 s95, s95, 0xff
; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8
-; SI-NEXT: v_readlane_b32 s30, v61, 47
+; SI-NEXT: v_readlane_b32 s30, v61, 49
; SI-NEXT: s_or_b32 s95, vcc_lo, s95
; SI-NEXT: s_add_i32 vcc_lo, s30, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 2
+; SI-NEXT: v_readlane_b32 s30, v61, 48
; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300
; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff
; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8
-; SI-NEXT: v_readlane_b32 s30, v61, 46
+; SI-NEXT: v_readlane_b32 s30, v61, 47
; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo
; SI-NEXT: s_add_i32 vcc_hi, s30, 3
-; SI-NEXT: v_readlane_b32 s30, v61, 45
+; SI-NEXT: v_readlane_b32 s30, v61, 46
; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff
; SI-NEXT: s_lshl_b32 s30, s30, 8
; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi
-; SI-NEXT: v_readlane_b32 s30, v61, 44
+; SI-NEXT: v_readlane_b32 s30, v61, 45
; SI-NEXT: s_add_i32 s30, s30, 3
-; SI-NEXT: v_readlane_b32 s31, v61, 43
+; SI-NEXT: v_readlane_b32 s31, v61, 44
; SI-NEXT: s_and_b32 s30, s30, 0xff
; SI-NEXT: s_lshl_b32 s31, s31, 8
; SI-NEXT: s_or_b32 s30, s31, s30
-; SI-NEXT: v_readlane_b32 s31, v61, 42
+; SI-NEXT: v_readlane_b32 s31, v61, 43
; SI-NEXT: s_add_i32 s29, s34, 0x300
; SI-NEXT: s_add_i32 s31, s31, 3
-; SI-NEXT: v_readlane_b32 s34, v61, 41
+; SI-NEXT: v_readlane_b32 s34, v61, 42
; SI-NEXT: s_and_b32 s31, s31, 0xff
; SI-NEXT: s_lshl_b32 s34, s34, 8
; SI-NEXT: s_or_b32 s31, s34, s31
@@ -178843,25 +178945,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v1, s31
; SI-NEXT: s_addk_i32 s30, 0x300
; SI-NEXT: s_addk_i32 vcc_hi, 0x300
-; SI-NEXT: v_readlane_b32 s34, v61, 40
+; SI-NEXT: v_readlane_b32 s34, v61, 41
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s30
; SI-NEXT: s_add_i32 s34, s34, 3
-; SI-NEXT: v_readlane_b32 s35, v61, 39
+; SI-NEXT: v_readlane_b32 s35, v61, 40
; SI-NEXT: s_and_b32 s34, s34, 0xff
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi
; SI-NEXT: s_lshl_b32 s35, s35, 8
-; SI-NEXT: s_addk_i32 vcc_lo, 0x300
+; SI-NEXT: s_addk_i32 s95, 0x300
; SI-NEXT: s_or_b32 s34, s35, s34
-; SI-NEXT: v_readlane_b32 s35, v61, 38
+; SI-NEXT: v_readlane_b32 s35, v61, 39
; SI-NEXT: s_add_i32 s35, s35, 3
-; SI-NEXT: v_readlane_b32 s36, v61, 37
+; SI-NEXT: v_readlane_b32 s36, v61, 38
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s95
; SI-NEXT: s_and_b32 s35, s35, 0xff
; SI-NEXT: s_lshl_b32 s36, s36, 8
; SI-NEXT: s_or_b32 s35, s36, s35
@@ -178908,13 +179010,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_addk_i32 s92, 0x300
; SI-NEXT: s_addk_i32 s93, 0x300
; SI-NEXT: s_addk_i32 s94, 0x300
-; SI-NEXT: s_addk_i32 s95, 0x300
+; SI-NEXT: s_addk_i32 vcc_lo, 0x300
; SI-NEXT: s_addk_i32 s34, 0x300
; SI-NEXT: s_addk_i32 s35, 0x300
; SI-NEXT: v_cvt_f32_f16_e32 v6, s35
; SI-NEXT: v_cvt_f32_f16_e32 v5, s34
+; SI-NEXT: v_cvt_f32_f16_e32 v7, vcc_lo
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s95
; SI-NEXT: v_cvt_f32_f16_e32 v10, s94
; SI-NEXT: v_cvt_f32_f16_e32 v8, s93
; SI-NEXT: v_cvt_f32_f16_e32 v11, s92
@@ -178973,7 +179075,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: .LBB93_3: ; %end
+; SI-NEXT: .LBB93_5: ; %end
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -179028,22 +179130,22 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
+; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0
; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
-; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_or_b32_e32 v5, v6, v5
; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
@@ -179260,134 +179362,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB93_4:
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_mov_b32 s17, s19
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: s_mov_b32 s19, s50
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_mov_b32 s23, s48
-; SI-NEXT: s_mov_b32 s26, s37
-; SI-NEXT: s_mov_b32 s28, s29
-; SI-NEXT: s_mov_b32 s29, s76
-; SI-NEXT: s_mov_b32 s59, s58
-; SI-NEXT: s_mov_b32 s56, s47
-; SI-NEXT: s_mov_b32 s46, s41
-; SI-NEXT: s_mov_b32 s12, s11
-; SI-NEXT: s_mov_b32 s11, s7
-; SI-NEXT: s_mov_b32 s7, s97
-; SI-NEXT: s_mov_b32 s97, s81
-; SI-NEXT: s_mov_b32 s81, s85
-; SI-NEXT: s_mov_b32 s6, s40
-; SI-NEXT: s_mov_b32 s40, s72
-; SI-NEXT: s_mov_b32 s45, s73
-; SI-NEXT: s_mov_b32 s15, s89
-; SI-NEXT: s_mov_b32 s24, s98
-; SI-NEXT: s_mov_b32 s20, s88
-; SI-NEXT: s_mov_b32 s99, s55
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_readlane_b32 s75, v61, 7
-; SI-NEXT: v_readlane_b32 s76, v61, 3
-; SI-NEXT: v_readlane_b32 s77, v61, 5
-; SI-NEXT: v_readlane_b32 s78, v61, 4
-; SI-NEXT: v_readlane_b32 s92, v61, 6
-; SI-NEXT: v_readlane_b32 s39, v61, 9
-; SI-NEXT: v_readlane_b32 s37, v61, 8
-; SI-NEXT: v_readlane_b32 s30, v61, 10
-; SI-NEXT: v_readlane_b32 s48, v61, 11
-; SI-NEXT: v_readlane_b32 s52, v61, 13
-; SI-NEXT: v_readlane_b32 s35, v61, 12
-; SI-NEXT: v_readlane_b32 s50, v61, 15
-; SI-NEXT: v_readlane_b32 s64, v61, 14
-; SI-NEXT: v_readlane_b32 s54, v61, 17
-; SI-NEXT: v_readlane_b32 s67, v61, 16
-; SI-NEXT: v_readlane_b32 s65, v61, 18
-; SI-NEXT: v_readlane_b32 s70, v61, 19
-; SI-NEXT: v_readlane_b32 s49, v61, 21
-; SI-NEXT: v_readlane_b32 s71, v61, 20
-; SI-NEXT: v_readlane_b32 s80, v61, 23
-; SI-NEXT: v_readlane_b32 s83, v61, 22
-; SI-NEXT: v_readlane_b32 s84, v61, 25
-; SI-NEXT: v_readlane_b32 s82, v61, 24
-; SI-NEXT: v_readlane_b32 s87, v61, 26
-; SI-NEXT: v_readlane_b32 s86, v61, 27
-; SI-NEXT: v_readlane_b32 s96, v61, 29
-; SI-NEXT: v_readlane_b32 s51, v61, 28
-; SI-NEXT: s_mov_b32 s55, s93
-; SI-NEXT: s_mov_b32 s95, s91
-; SI-NEXT: v_readlane_b32 s94, v61, 31
-; SI-NEXT: s_mov_b32 s31, s90
-; SI-NEXT: v_readlane_b32 s34, v61, 30
-; SI-NEXT: v_readlane_b32 s53, v61, 32
-; SI-NEXT: v_readlane_b32 s66, v61, 33
-; SI-NEXT: v_readlane_b32 s68, v61, 34
-; SI-NEXT: v_readlane_b32 s69, v61, 35
-; SI-NEXT: v_readlane_b32 s8, v61, 36
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $vgpr21
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr24
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $vgpr28
-; SI-NEXT: ; implicit-def: $vgpr31
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: ; implicit-def: $vgpr34
-; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr51
-; SI-NEXT: ; implicit-def: $vgpr50
-; SI-NEXT: ; implicit-def: $vgpr53
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr54
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr40
-; SI-NEXT: ; implicit-def: $vgpr43
-; SI-NEXT: ; implicit-def: $vgpr42
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: s_branch .LBB93_2
;
; VI-LABEL: bitcast_v128i8_to_v64f16_scalar:
; VI: ; %bb.0:
@@ -186210,24 +186184,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -186248,10 +186204,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; implicit-def: $vgpr57
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr63
@@ -186259,7 +186214,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr37
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr49
@@ -186268,13 +186222,33 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr42
-; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -186282,9 +186256,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
@@ -186308,7 +186285,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: s_waitcnt vmcnt(43)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -186367,180 +186344,195 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB94_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(62)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(62)
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -186550,9 +186542,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
-; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29
; GFX9-NEXT: .LBB94_2: ; %Flow
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v43, v50
+; GFX9-NEXT: v_mov_b32_e32 v50, v40
+; GFX9-NEXT: v_mov_b32_e32 v40, v55
+; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB94_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: s_movk_i32 s6, 0x200
@@ -186565,12 +186561,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(34)
-; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(52)
+; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0]
@@ -186602,164 +186622,149 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
-; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
-; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
-; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -186776,41 +186781,50 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61
; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38
-; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60
-; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59
-; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
-; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -186819,84 +186833,103 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49
; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42
-; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -186906,16 +186939,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -186925,23 +186958,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
@@ -186949,14 +186975,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33
-; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -186965,11 +186995,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -186978,10 +187008,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -186991,11 +187021,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -187004,10 +187034,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -187017,11 +187047,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -187030,10 +187060,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -187043,11 +187073,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -187056,10 +187086,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -187069,53 +187099,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59
; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58
; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -200170,36 +200181,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s86, 30
; SI-NEXT: v_writelane_b32 v41, s87, 31
; SI-NEXT: v_writelane_b32 v41, s96, 32
-; SI-NEXT: v_writelane_b32 v41, s97, 33
-; SI-NEXT: v_writelane_b32 v41, s98, 34
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160
; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT: v_readfirstlane_b32 s39, v26
+; SI-NEXT: v_readfirstlane_b32 s56, v12
; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: v_readfirstlane_b32 s47, v12
-; SI-NEXT: v_writelane_b32 v42, s39, 0
-; SI-NEXT: v_readfirstlane_b32 s56, v11
-; SI-NEXT: v_writelane_b32 v42, s47, 1
-; SI-NEXT: v_readfirstlane_b32 s48, v24
-; SI-NEXT: v_writelane_b32 v42, s56, 2
-; SI-NEXT: v_readfirstlane_b32 s49, v23
-; SI-NEXT: v_writelane_b32 v42, s48, 3
-; SI-NEXT: v_readfirstlane_b32 s50, v21
-; SI-NEXT: v_writelane_b32 v42, s49, 4
-; SI-NEXT: v_readfirstlane_b32 s51, v22
-; SI-NEXT: v_writelane_b32 v42, s50, 5
-; SI-NEXT: v_writelane_b32 v42, s51, 6
-; SI-NEXT: v_readfirstlane_b32 s57, v20
-; SI-NEXT: v_readfirstlane_b32 s58, v19
-; SI-NEXT: v_readfirstlane_b32 s64, v29
-; SI-NEXT: v_readfirstlane_b32 s65, v30
-; SI-NEXT: v_readfirstlane_b32 s59, v28
-; SI-NEXT: v_readfirstlane_b32 s60, v27
-; SI-NEXT: v_readfirstlane_b32 s11, v1
+; SI-NEXT: v_readfirstlane_b32 s57, v11
+; SI-NEXT: v_writelane_b32 v42, s56, 0
+; SI-NEXT: v_readfirstlane_b32 s50, v24
+; SI-NEXT: v_writelane_b32 v42, s57, 1
+; SI-NEXT: v_readfirstlane_b32 s51, v23
+; SI-NEXT: v_writelane_b32 v42, s50, 2
+; SI-NEXT: v_readfirstlane_b32 s52, v21
+; SI-NEXT: v_writelane_b32 v42, s51, 3
+; SI-NEXT: v_readfirstlane_b32 s53, v22
+; SI-NEXT: v_writelane_b32 v42, s52, 4
+; SI-NEXT: v_writelane_b32 v42, s53, 5
+; SI-NEXT: v_readfirstlane_b32 s58, v20
+; SI-NEXT: v_readfirstlane_b32 s59, v19
+; SI-NEXT: v_readfirstlane_b32 s67, v29
+; SI-NEXT: v_readfirstlane_b32 s71, v30
+; SI-NEXT: v_writelane_b32 v41, s97, 33
+; SI-NEXT: v_readfirstlane_b32 s60, v28
+; SI-NEXT: v_readfirstlane_b32 s61, v27
+; SI-NEXT: v_writelane_b32 v41, s98, 34
+; SI-NEXT: v_readfirstlane_b32 s14, v1
+; SI-NEXT: v_readfirstlane_b32 s15, v2
+; SI-NEXT: v_readfirstlane_b32 s21, v9
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 14
@@ -200221,48 +200232,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v35
; SI-NEXT: v_writelane_b32 v43, s4, 18
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s44, v36
-; SI-NEXT: v_readfirstlane_b32 s90, v37
+; SI-NEXT: v_readfirstlane_b32 s4, v36
+; SI-NEXT: v_writelane_b32 v43, s4, 19
+; SI-NEXT: v_readfirstlane_b32 s24, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_readfirstlane_b32 s6, v38
-; SI-NEXT: v_readfirstlane_b32 s12, v2
-; SI-NEXT: v_readfirstlane_b32 s13, v9
-; SI-NEXT: v_readfirstlane_b32 s14, v10
-; SI-NEXT: v_readfirstlane_b32 s15, v8
-; SI-NEXT: v_readfirstlane_b32 s18, v7
-; SI-NEXT: v_readfirstlane_b32 s21, v5
-; SI-NEXT: v_readfirstlane_b32 s22, v6
-; SI-NEXT: v_readfirstlane_b32 s40, v17
-; SI-NEXT: v_readfirstlane_b32 s41, v18
-; SI-NEXT: v_readfirstlane_b32 s42, v4
-; SI-NEXT: v_readfirstlane_b32 s43, v3
-; SI-NEXT: v_readfirstlane_b32 s76, v16
-; SI-NEXT: v_readfirstlane_b32 s77, v15
-; SI-NEXT: v_readfirstlane_b32 s38, v25
+; SI-NEXT: v_readfirstlane_b32 s9, v38
+; SI-NEXT: v_readfirstlane_b32 s22, v10
+; SI-NEXT: v_readfirstlane_b32 s40, v8
+; SI-NEXT: v_readfirstlane_b32 s41, v7
+; SI-NEXT: v_readfirstlane_b32 s42, v5
+; SI-NEXT: v_readfirstlane_b32 s43, v6
+; SI-NEXT: v_readfirstlane_b32 s76, v17
+; SI-NEXT: v_readfirstlane_b32 s77, v18
+; SI-NEXT: v_readfirstlane_b32 s46, v4
+; SI-NEXT: v_readfirstlane_b32 s47, v3
+; SI-NEXT: v_readfirstlane_b32 s38, v13
+; SI-NEXT: v_readfirstlane_b32 s39, v14
+; SI-NEXT: v_readfirstlane_b32 s48, v25
+; SI-NEXT: v_readfirstlane_b32 s49, v26
; SI-NEXT: v_writelane_b32 v41, s99, 35
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_readfirstlane_b32 s93, v55
+; SI-NEXT: v_readfirstlane_b32 s95, v55
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s95, v40
+; SI-NEXT: v_readfirstlane_b32 s93, v40
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v43, s4, 19
+; SI-NEXT: v_writelane_b32 v43, s4, 20
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v43, s4, 20
+; SI-NEXT: v_writelane_b32 v43, s4, 21
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v48
-; SI-NEXT: v_writelane_b32 v43, s4, 21
+; SI-NEXT: v_writelane_b32 v43, s4, 22
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v49
-; SI-NEXT: v_writelane_b32 v43, s4, 22
+; SI-NEXT: v_writelane_b32 v43, s4, 23
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_readfirstlane_b32 s4, v50
-; SI-NEXT: v_writelane_b32 v43, s4, 23
+; SI-NEXT: v_writelane_b32 v43, s4, 24
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_readfirstlane_b32 s4, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -200273,37 +200284,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s91, v32
+; SI-NEXT: v_readfirstlane_b32 s8, v32
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s8, v33
+; SI-NEXT: v_readfirstlane_b32 s11, v33
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT: v_writelane_b32 v43, s4, 24
+; SI-NEXT: v_writelane_b32 v43, s4, 25
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v43, s4, 25
+; SI-NEXT: v_writelane_b32 v43, s4, 26
; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_readfirstlane_b32 s4, v35
-; SI-NEXT: v_writelane_b32 v43, s4, 26
+; SI-NEXT: v_writelane_b32 v43, s4, 27
; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_readfirstlane_b32 s4, v36
-; SI-NEXT: v_writelane_b32 v43, s4, 27
+; SI-NEXT: v_writelane_b32 v43, s4, 28
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204
-; SI-NEXT: v_writelane_b32 v43, s4, 28
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: v_writelane_b32 v43, s4, 29
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_readfirstlane_b32 s89, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s89, v38
+; SI-NEXT: v_readfirstlane_b32 s70, v38
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s78, v39
+; SI-NEXT: v_readfirstlane_b32 s7, v39
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s7, v48
+; SI-NEXT: v_readfirstlane_b32 s10, v48
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s82, v49
; SI-NEXT: s_waitcnt vmcnt(7)
@@ -200317,39 +200327,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176
+; SI-NEXT: v_writelane_b32 v43, s4, 30
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s70, v33
+; SI-NEXT: v_readfirstlane_b32 s69, v33
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT: v_writelane_b32 v43, s4, 30
-; SI-NEXT: v_readfirstlane_b32 s4, v32
-; SI-NEXT: v_writelane_b32 v43, s4, 31
+; SI-NEXT: v_readfirstlane_b32 s92, v32
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s4, v34
-; SI-NEXT: v_writelane_b32 v43, s4, 32
+; SI-NEXT: v_readfirstlane_b32 s18, v34
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s9, v35
+; SI-NEXT: v_readfirstlane_b32 s12, v35
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_readfirstlane_b32 s13, v36
; SI-NEXT: s_waitcnt vmcnt(9)
; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_writelane_b32 v43, s4, 33
-; SI-NEXT: v_readfirstlane_b32 s10, v36
+; SI-NEXT: v_writelane_b32 v43, s4, 31
; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_readfirstlane_b32 s4, v31
-; SI-NEXT: v_writelane_b32 v43, s4, 34
+; SI-NEXT: v_writelane_b32 v43, s4, 32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s4, v38
-; SI-NEXT: v_writelane_b32 v43, s4, 35
+; SI-NEXT: v_readfirstlane_b32 s90, v38
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s4, v39
-; SI-NEXT: v_writelane_b32 v43, s4, 36
+; SI-NEXT: v_readfirstlane_b32 s91, v39
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s69, v48
+; SI-NEXT: v_readfirstlane_b32 s68, v48
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s37, v49
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_readfirstlane_b32 s16, v50
+; SI-NEXT: v_readfirstlane_b32 s84, v50
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
+; SI-NEXT: v_readfirstlane_b32 s6, v51
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; SI-NEXT: s_waitcnt vmcnt(3)
@@ -200365,50 +200372,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT: v_writelane_b32 v43, s4, 37
+; SI-NEXT: v_writelane_b32 v43, s4, 33
; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_readfirstlane_b32 s4, v52
-; SI-NEXT: v_writelane_b32 v43, s4, 38
+; SI-NEXT: v_writelane_b32 v43, s4, 34
; SI-NEXT: v_readfirstlane_b32 s4, v53
-; SI-NEXT: v_writelane_b32 v43, s4, 39
+; SI-NEXT: v_writelane_b32 v43, s4, 35
; SI-NEXT: v_readfirstlane_b32 s4, v54
-; SI-NEXT: v_writelane_b32 v43, s4, 40
-; SI-NEXT: v_writelane_b32 v43, s44, 41
-; SI-NEXT: v_writelane_b32 v43, s6, 42
-; SI-NEXT: v_writelane_b32 v43, s7, 43
-; SI-NEXT: v_writelane_b32 v43, s8, 44
-; SI-NEXT: v_writelane_b32 v43, s9, 45
-; SI-NEXT: v_writelane_b32 v43, s10, 46
-; SI-NEXT: v_writelane_b32 v43, s11, 47
-; SI-NEXT: v_writelane_b32 v43, s12, 48
-; SI-NEXT: v_writelane_b32 v43, s13, 49
-; SI-NEXT: v_writelane_b32 v43, s14, 50
-; SI-NEXT: v_writelane_b32 v43, s15, 51
-; SI-NEXT: v_writelane_b32 v43, s18, 52
-; SI-NEXT: v_writelane_b32 v43, s21, 53
-; SI-NEXT: v_writelane_b32 v43, s22, 54
-; SI-NEXT: v_writelane_b32 v43, s40, 55
-; SI-NEXT: v_writelane_b32 v43, s41, 56
-; SI-NEXT: v_writelane_b32 v43, s42, 57
-; SI-NEXT: v_writelane_b32 v43, s43, 58
-; SI-NEXT: v_writelane_b32 v43, s76, 59
-; SI-NEXT: v_writelane_b32 v43, s77, 60
+; SI-NEXT: v_writelane_b32 v43, s4, 36
+; SI-NEXT: v_writelane_b32 v43, s6, 37
+; SI-NEXT: v_writelane_b32 v43, s7, 38
+; SI-NEXT: v_writelane_b32 v43, s8, 39
+; SI-NEXT: v_writelane_b32 v43, s18, 40
+; SI-NEXT: v_writelane_b32 v43, s9, 41
+; SI-NEXT: v_writelane_b32 v43, s10, 42
+; SI-NEXT: v_writelane_b32 v43, s11, 43
+; SI-NEXT: v_writelane_b32 v43, s12, 44
+; SI-NEXT: v_writelane_b32 v43, s13, 45
+; SI-NEXT: v_writelane_b32 v43, s14, 46
+; SI-NEXT: v_writelane_b32 v43, s15, 47
+; SI-NEXT: v_writelane_b32 v43, s21, 48
+; SI-NEXT: v_writelane_b32 v43, s22, 49
+; SI-NEXT: v_writelane_b32 v43, s40, 50
+; SI-NEXT: v_writelane_b32 v43, s41, 51
+; SI-NEXT: v_writelane_b32 v43, s42, 52
+; SI-NEXT: v_writelane_b32 v43, s43, 53
+; SI-NEXT: v_writelane_b32 v43, s76, 54
+; SI-NEXT: v_writelane_b32 v43, s77, 55
+; SI-NEXT: v_writelane_b32 v43, s46, 56
+; SI-NEXT: v_writelane_b32 v43, s47, 57
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s17, v33
+; SI-NEXT: v_readfirstlane_b32 s16, v33
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s98, v34
+; SI-NEXT: v_readfirstlane_b32 s35, v34
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s23, v35
-; SI-NEXT: v_readfirstlane_b32 s25, v31
-; SI-NEXT: v_readfirstlane_b32 s28, v32
+; SI-NEXT: v_readfirstlane_b32 s19, v35
+; SI-NEXT: v_readfirstlane_b32 s28, v31
+; SI-NEXT: v_readfirstlane_b32 s29, v32
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s26, v36
+; SI-NEXT: v_readfirstlane_b32 s87, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s88, v37
+; SI-NEXT: v_readfirstlane_b32 s79, v37
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s79, v38
+; SI-NEXT: v_readfirstlane_b32 s27, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s75, v39
+; SI-NEXT: v_readfirstlane_b32 s25, v39
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
@@ -200421,39 +200429,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s24, v49
+; SI-NEXT: v_readfirstlane_b32 s88, v49
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s85, v50
+; SI-NEXT: v_readfirstlane_b32 s86, v50
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s66, v51
+; SI-NEXT: v_readfirstlane_b32 s34, v51
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 vcc_lo, v13
-; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14
-; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61
-; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62
-; SI-NEXT: v_writelane_b32 v43, s38, 63
+; SI-NEXT: v_readfirstlane_b32 vcc_lo, v16
+; SI-NEXT: v_readfirstlane_b32 vcc_hi, v15
+; SI-NEXT: v_writelane_b32 v43, vcc_lo, 58
+; SI-NEXT: v_writelane_b32 v43, vcc_hi, 59
+; SI-NEXT: v_writelane_b32 v43, s38, 60
+; SI-NEXT: v_writelane_b32 v43, s39, 61
+; SI-NEXT: v_writelane_b32 v43, s48, 62
+; SI-NEXT: v_writelane_b32 v43, s49, 63
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s20, v31
+; SI-NEXT: v_readfirstlane_b32 s17, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s19, v32
+; SI-NEXT: v_readfirstlane_b32 s20, v32
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s27, v33
+; SI-NEXT: v_readfirstlane_b32 s94, v33
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s94, v34
+; SI-NEXT: v_readfirstlane_b32 s26, v34
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s72, v35
+; SI-NEXT: v_readfirstlane_b32 s73, v35
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s73, v36
+; SI-NEXT: v_readfirstlane_b32 s74, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s67, v37
+; SI-NEXT: v_readfirstlane_b32 s80, v37
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s71, v38
+; SI-NEXT: v_readfirstlane_b32 s81, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s97, v39
+; SI-NEXT: v_readfirstlane_b32 s36, v39
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
@@ -200463,141 +200474,141 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s35, v48
+; SI-NEXT: v_readfirstlane_b32 s31, v48
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s83, v49
+; SI-NEXT: v_readfirstlane_b32 s23, v49
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s87, v50
+; SI-NEXT: v_readfirstlane_b32 s83, v50
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s63, v51
+; SI-NEXT: v_readfirstlane_b32 s72, v51
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s74, v31
+; SI-NEXT: v_readfirstlane_b32 s75, v31
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s81, v32
+; SI-NEXT: v_readfirstlane_b32 s78, v32
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s80, v33
+; SI-NEXT: v_readfirstlane_b32 s97, v33
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s86, v34
+; SI-NEXT: v_readfirstlane_b32 s98, v34
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s34, v35
+; SI-NEXT: v_readfirstlane_b32 s30, v35
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s84, v36
+; SI-NEXT: v_readfirstlane_b32 s85, v36
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s31, v37
+; SI-NEXT: v_readfirstlane_b32 s66, v37
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s61, v38
+; SI-NEXT: v_readfirstlane_b32 s62, v38
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s62, v39
+; SI-NEXT: v_readfirstlane_b32 s63, v39
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_readfirstlane_b32 s53, v48
+; SI-NEXT: v_readfirstlane_b32 s55, v48
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_readfirstlane_b32 s52, v49
-; SI-NEXT: v_writelane_b32 v42, s52, 7
-; SI-NEXT: v_writelane_b32 v42, s53, 8
-; SI-NEXT: v_writelane_b32 v42, s57, 9
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_readfirstlane_b32 s54, v50
-; SI-NEXT: v_writelane_b32 v42, s58, 10
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s55, v51
-; SI-NEXT: v_writelane_b32 v42, s54, 11
-; SI-NEXT: v_writelane_b32 v42, s55, 12
-; SI-NEXT: v_writelane_b32 v42, s64, 13
-; SI-NEXT: v_writelane_b32 v42, s65, 14
-; SI-NEXT: v_writelane_b32 v42, s67, 15
-; SI-NEXT: v_writelane_b32 v42, s71, 16
-; SI-NEXT: v_writelane_b32 v42, s80, 17
-; SI-NEXT: v_writelane_b32 v42, s81, 18
-; SI-NEXT: v_writelane_b32 v42, s59, 19
-; SI-NEXT: v_writelane_b32 v42, s60, 20
-; SI-NEXT: v_writelane_b32 v42, s86, 21
-; SI-NEXT: v_writelane_b32 v42, s97, 22
+; SI-NEXT: v_readfirstlane_b32 s54, v49
+; SI-NEXT: v_writelane_b32 v42, s54, 6
+; SI-NEXT: v_writelane_b32 v42, s55, 7
+; SI-NEXT: v_writelane_b32 v42, s58, 8
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s64, v50
+; SI-NEXT: v_writelane_b32 v42, s59, 9
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_readfirstlane_b32 s65, v51
+; SI-NEXT: v_writelane_b32 v42, s64, 10
+; SI-NEXT: v_writelane_b32 v42, s65, 11
+; SI-NEXT: v_writelane_b32 v42, s67, 12
+; SI-NEXT: v_writelane_b32 v42, s71, 13
+; SI-NEXT: v_writelane_b32 v42, s80, 14
+; SI-NEXT: v_writelane_b32 v42, s81, 15
+; SI-NEXT: v_writelane_b32 v42, s97, 16
+; SI-NEXT: v_writelane_b32 v42, s78, 17
+; SI-NEXT: v_writelane_b32 v42, s60, 18
+; SI-NEXT: v_writelane_b32 v42, s61, 19
+; SI-NEXT: v_writelane_b32 v42, s98, 20
+; SI-NEXT: v_writelane_b32 v42, s36, 21
+; SI-NEXT: v_writelane_b32 v42, s30, 22
; SI-NEXT: v_writelane_b32 v42, s34, 23
-; SI-NEXT: v_writelane_b32 v42, s66, 24
-; SI-NEXT: v_writelane_b32 v42, s85, 25
-; SI-NEXT: v_writelane_b32 v42, s31, 26
-; SI-NEXT: v_writelane_b32 v42, s84, 27
+; SI-NEXT: v_writelane_b32 v42, s86, 24
+; SI-NEXT: v_writelane_b32 v42, s66, 25
+; SI-NEXT: v_writelane_b32 v42, s85, 26
+; SI-NEXT: v_writelane_b32 v42, s31, 27
; SI-NEXT: v_writelane_b32 v42, s35, 28
-; SI-NEXT: v_writelane_b32 v42, s98, 29
+; SI-NEXT: v_writelane_b32 v42, s16, 29
; SI-NEXT: v_writelane_b32 v42, s17, 30
-; SI-NEXT: v_writelane_b32 v42, s20, 31
-; SI-NEXT: v_writelane_b32 v42, s61, 32
-; SI-NEXT: v_writelane_b32 v42, s19, 33
-; SI-NEXT: v_writelane_b32 v42, s62, 34
+; SI-NEXT: v_writelane_b32 v42, s62, 31
+; SI-NEXT: v_writelane_b32 v42, s20, 32
+; SI-NEXT: v_writelane_b32 v42, s63, 33
+; SI-NEXT: v_writelane_b32 v42, s19, 34
; SI-NEXT: v_writelane_b32 v42, s23, 35
; SI-NEXT: v_writelane_b32 v42, s83, 36
; SI-NEXT: v_writelane_b32 v42, s87, 37
; SI-NEXT: v_writelane_b32 v42, s26, 38
; SI-NEXT: v_writelane_b32 v42, s94, 39
-; SI-NEXT: v_writelane_b32 v42, s27, 40
-; SI-NEXT: v_writelane_b32 v42, s63, 41
+; SI-NEXT: v_writelane_b32 v42, s72, 40
+; SI-NEXT: v_writelane_b32 v42, s27, 41
; SI-NEXT: v_writelane_b32 v42, s79, 42
-; SI-NEXT: v_writelane_b32 v42, s88, 43
-; SI-NEXT: v_writelane_b32 v42, s72, 44
-; SI-NEXT: v_writelane_b32 v42, s73, 45
-; SI-NEXT: v_writelane_b32 v42, s74, 46
-; SI-NEXT: v_writelane_b32 v42, s75, 47
-; SI-NEXT: v_writelane_b32 v42, s24, 48
-; SI-NEXT: v_writelane_b32 v42, s25, 49
-; SI-NEXT: v_writelane_b32 v42, s28, 50
+; SI-NEXT: v_writelane_b32 v42, s73, 43
+; SI-NEXT: v_writelane_b32 v42, s74, 44
+; SI-NEXT: v_writelane_b32 v42, s75, 45
+; SI-NEXT: v_writelane_b32 v42, s25, 46
+; SI-NEXT: v_writelane_b32 v42, s88, 47
+; SI-NEXT: v_writelane_b32 v42, s28, 48
+; SI-NEXT: v_writelane_b32 v42, s29, 49
; SI-NEXT: s_cbranch_scc0 .LBB97_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_readlane_b32 s4, v43, 13
; SI-NEXT: v_readlane_b32 s5, v43, 12
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
-; SI-NEXT: s_or_b32 s29, s4, s5
+; SI-NEXT: s_or_b32 s44, s4, s5
; SI-NEXT: v_readlane_b32 s4, v43, 5
; SI-NEXT: v_readlane_b32 s5, v43, 4
; SI-NEXT: s_and_b32 s4, s4, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_or_b32 s45, s4, s5
-; SI-NEXT: s_and_b32 s4, s43, 0xff
-; SI-NEXT: s_lshl_b32 s5, s42, 8
+; SI-NEXT: s_and_b32 s4, s47, 0xff
+; SI-NEXT: s_lshl_b32 s5, s46, 8
; SI-NEXT: s_or_b32 s46, s4, s5
-; SI-NEXT: s_and_b32 s4, s56, 0xff
-; SI-NEXT: s_lshl_b32 s5, s47, 8
+; SI-NEXT: s_and_b32 s4, s57, 0xff
+; SI-NEXT: s_lshl_b32 s5, s56, 8
; SI-NEXT: s_or_b32 s47, s4, s5
-; SI-NEXT: s_and_b32 s4, s58, 0xff
-; SI-NEXT: s_lshl_b32 s5, s57, 8
+; SI-NEXT: s_and_b32 s4, s59, 0xff
+; SI-NEXT: s_lshl_b32 s5, s58, 8
; SI-NEXT: s_or_b32 s56, s4, s5
-; SI-NEXT: s_and_b32 s4, s60, 0xff
-; SI-NEXT: s_lshl_b32 s5, s59, 8
+; SI-NEXT: s_and_b32 s4, s61, 0xff
+; SI-NEXT: s_lshl_b32 s5, s60, 8
; SI-NEXT: s_or_b32 s57, s4, s5
-; SI-NEXT: s_and_b32 s4, s62, 0xff
-; SI-NEXT: s_lshl_b32 s5, s61, 8
+; SI-NEXT: s_and_b32 s4, s63, 0xff
+; SI-NEXT: s_lshl_b32 s5, s62, 8
; SI-NEXT: s_or_b32 s58, s4, s5
-; SI-NEXT: s_and_b32 s4, s74, 0xff
-; SI-NEXT: s_lshl_b32 s5, s63, 8
-; SI-NEXT: s_or_b32 s59, s4, s5
-; SI-NEXT: s_and_b32 s4, s73, 0xff
+; SI-NEXT: s_and_b32 s4, s75, 0xff
; SI-NEXT: s_lshl_b32 s5, s72, 8
+; SI-NEXT: s_or_b32 s59, s4, s5
+; SI-NEXT: s_and_b32 s4, s74, 0xff
+; SI-NEXT: s_lshl_b32 s5, s73, 8
; SI-NEXT: s_or_b32 s60, s4, s5
-; SI-NEXT: s_and_b32 s4, s24, 0xff
-; SI-NEXT: s_lshl_b32 s5, s75, 8
-; SI-NEXT: s_or_b32 s61, s4, s5
-; SI-NEXT: s_and_b32 s4, s28, 0xff
+; SI-NEXT: s_and_b32 s4, s88, 0xff
; SI-NEXT: s_lshl_b32 s5, s25, 8
+; SI-NEXT: s_or_b32 s61, s4, s5
+; SI-NEXT: s_and_b32 s4, s29, 0xff
+; SI-NEXT: s_lshl_b32 s5, s28, 8
; SI-NEXT: s_or_b32 s62, s4, s5
-; SI-NEXT: s_and_b32 s4, s36, 0xff
-; SI-NEXT: s_lshl_b32 s5, s16, 8
+; SI-NEXT: s_and_b32 s4, s6, 0xff
+; SI-NEXT: s_lshl_b32 s5, s84, 8
; SI-NEXT: s_or_b32 s63, s4, s5
-; SI-NEXT: s_and_b32 s4, s10, 0xff
-; SI-NEXT: s_lshl_b32 s5, s9, 8
+; SI-NEXT: s_and_b32 s4, s13, 0xff
+; SI-NEXT: s_lshl_b32 s5, s12, 8
; SI-NEXT: s_or_b32 s72, s4, s5
-; SI-NEXT: s_and_b32 s4, s7, 0xff
-; SI-NEXT: s_lshl_b32 s5, s78, 8
+; SI-NEXT: s_and_b32 s4, s10, 0xff
+; SI-NEXT: s_lshl_b32 s5, s7, 8
; SI-NEXT: s_or_b32 s73, s4, s5
-; SI-NEXT: s_and_b32 s4, s8, 0xff
-; SI-NEXT: s_lshl_b32 s5, s91, 8
+; SI-NEXT: s_and_b32 s4, s11, 0xff
+; SI-NEXT: s_lshl_b32 s5, s8, 8
; SI-NEXT: s_or_b32 s74, s4, s5
-; SI-NEXT: s_and_b32 s4, s6, 0xff
-; SI-NEXT: s_lshl_b32 s5, s90, 8
+; SI-NEXT: s_and_b32 s4, s9, 0xff
+; SI-NEXT: s_lshl_b32 s5, s24, 8
; SI-NEXT: s_or_b32 s75, s4, s5
; SI-NEXT: v_readlane_b32 s4, v43, 9
; SI-NEXT: v_readlane_b32 s5, v43, 8
@@ -200616,7 +200627,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshl_b32 s6, s6, 24
; SI-NEXT: s_and_b32 s5, s5, 0xffff
-; SI-NEXT: v_writelane_b32 v42, s7, 51
+; SI-NEXT: v_writelane_b32 v42, s7, 52
; SI-NEXT: s_or_b32 s4, s6, s4
; SI-NEXT: s_or_b32 s5, s5, s7
; SI-NEXT: v_readlane_b32 s6, v43, 1
@@ -200624,345 +200635,340 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_or_b32 s7, s6, s7
-; SI-NEXT: s_and_b32 s6, s11, 0xff
+; SI-NEXT: s_and_b32 s6, s14, 0xff
; SI-NEXT: s_lshl_b32 s6, s6, 16
-; SI-NEXT: s_lshl_b32 s8, s12, 24
-; SI-NEXT: s_or_b32 s37, s8, s6
+; SI-NEXT: s_lshl_b32 s8, s15, 24
+; SI-NEXT: s_or_b32 s9, s8, s6
; SI-NEXT: v_readlane_b32 s6, v43, 3
; SI-NEXT: s_and_b32 s6, s6, 0xff
; SI-NEXT: v_readlane_b32 s8, v43, 2
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshl_b32 s8, s8, 24
+; SI-NEXT: s_and_b32 s7, s7, 0xffff
+; SI-NEXT: v_writelane_b32 v42, s9, 53
; SI-NEXT: s_or_b32 s6, s8, s6
-; SI-NEXT: s_and_b32 s8, s18, 0xff
-; SI-NEXT: s_lshl_b32 s9, s15, 8
+; SI-NEXT: s_or_b32 s7, s7, s9
+; SI-NEXT: s_and_b32 s8, s41, 0xff
+; SI-NEXT: s_lshl_b32 s9, s40, 8
; SI-NEXT: s_or_b32 s9, s8, s9
-; SI-NEXT: s_and_b32 s8, s13, 0xff
-; SI-NEXT: s_lshl_b32 s8, s8, 16
-; SI-NEXT: s_lshl_b32 s10, s14, 24
-; SI-NEXT: s_or_b32 s68, s10, s8
; SI-NEXT: s_and_b32 s8, s21, 0xff
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_lshl_b32 s10, s22, 24
+; SI-NEXT: s_or_b32 s11, s10, s8
+; SI-NEXT: s_and_b32 s8, s42, 0xff
+; SI-NEXT: s_lshl_b32 s8, s8, 16
+; SI-NEXT: s_lshl_b32 s10, s43, 24
+; SI-NEXT: s_and_b32 s9, s9, 0xffff
+; SI-NEXT: v_writelane_b32 v42, s11, 54
; SI-NEXT: s_or_b32 s8, s10, s8
-; SI-NEXT: s_and_b32 s10, s77, 0xff
-; SI-NEXT: s_lshl_b32 s11, s76, 8
+; SI-NEXT: s_or_b32 s9, s9, s11
+; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff
+; SI-NEXT: s_lshl_b32 s11, vcc_lo, 8
; SI-NEXT: s_or_b32 s11, s10, s11
-; SI-NEXT: s_and_b32 s10, s40, 0xff
+; SI-NEXT: s_and_b32 s10, s76, 0xff
; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_lshl_b32 s12, s41, 24
-; SI-NEXT: s_or_b32 s99, s12, s10
-; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff
+; SI-NEXT: s_lshl_b32 s12, s77, 24
+; SI-NEXT: s_or_b32 s13, s12, s10
+; SI-NEXT: s_and_b32 s10, s38, 0xff
; SI-NEXT: s_lshl_b32 s10, s10, 16
-; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24
+; SI-NEXT: s_lshl_b32 s12, s39, 24
+; SI-NEXT: s_and_b32 s11, s11, 0xffff
+; SI-NEXT: v_writelane_b32 v42, s13, 55
; SI-NEXT: s_or_b32 s10, s12, s10
-; SI-NEXT: s_and_b32 s12, s49, 0xff
-; SI-NEXT: s_lshl_b32 s13, s48, 8
+; SI-NEXT: s_or_b32 s11, s11, s13
+; SI-NEXT: s_and_b32 s12, s51, 0xff
+; SI-NEXT: s_lshl_b32 s13, s50, 8
; SI-NEXT: s_or_b32 s13, s12, s13
-; SI-NEXT: s_and_b32 s12, s38, 0xff
+; SI-NEXT: s_and_b32 s12, s48, 0xff
; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: s_lshl_b32 s14, s39, 24
-; SI-NEXT: s_or_b32 s92, s14, s12
-; SI-NEXT: s_and_b32 s12, s50, 0xff
+; SI-NEXT: s_lshl_b32 s14, s49, 24
+; SI-NEXT: s_or_b32 s99, s14, s12
+; SI-NEXT: s_and_b32 s12, s52, 0xff
; SI-NEXT: s_lshl_b32 s12, s12, 16
-; SI-NEXT: s_lshl_b32 s14, s51, 24
+; SI-NEXT: s_lshl_b32 s14, s53, 24
; SI-NEXT: s_or_b32 s12, s14, s12
-; SI-NEXT: s_and_b32 s14, s55, 0xff
-; SI-NEXT: s_lshl_b32 s15, s54, 8
+; SI-NEXT: s_and_b32 s14, s65, 0xff
+; SI-NEXT: s_lshl_b32 s15, s64, 8
; SI-NEXT: s_or_b32 s15, s14, s15
-; SI-NEXT: s_and_b32 s14, s52, 0xff
+; SI-NEXT: s_and_b32 s14, s54, 0xff
; SI-NEXT: s_lshl_b32 s14, s14, 16
-; SI-NEXT: s_lshl_b32 s24, s53, 24
-; SI-NEXT: s_mov_b32 s28, s90
-; SI-NEXT: s_or_b32 s90, s24, s14
-; SI-NEXT: s_and_b32 s14, s64, 0xff
+; SI-NEXT: v_writelane_b32 v42, s24, 56
+; SI-NEXT: s_lshl_b32 s24, s55, 24
+; SI-NEXT: s_or_b32 s24, s24, s14
+; SI-NEXT: s_and_b32 s14, s67, 0xff
; SI-NEXT: s_lshl_b32 s14, s14, 16
-; SI-NEXT: s_lshl_b32 s25, s65, 24
+; SI-NEXT: s_lshl_b32 s25, s71, 24
; SI-NEXT: s_or_b32 s14, s25, s14
-; SI-NEXT: s_and_b32 s25, s34, 0xff
-; SI-NEXT: s_lshl_b32 s40, s86, 8
+; SI-NEXT: s_and_b32 s25, s30, 0xff
+; SI-NEXT: s_lshl_b32 s40, s98, 8
; SI-NEXT: s_or_b32 s41, s25, s40
-; SI-NEXT: s_and_b32 s25, s80, 0xff
+; SI-NEXT: s_and_b32 s25, s97, 0xff
; SI-NEXT: s_lshl_b32 s25, s25, 16
-; SI-NEXT: s_lshl_b32 s40, s81, 24
-; SI-NEXT: s_or_b32 s18, s40, s25
-; SI-NEXT: s_and_b32 s40, s31, 0xff
+; SI-NEXT: s_lshl_b32 s40, s78, 24
+; SI-NEXT: s_or_b32 s25, s40, s25
+; SI-NEXT: s_and_b32 s40, s66, 0xff
; SI-NEXT: s_lshl_b32 s40, s40, 16
-; SI-NEXT: s_lshl_b32 s42, s84, 24
+; SI-NEXT: s_lshl_b32 s42, s85, 24
; SI-NEXT: s_or_b32 s40, s42, s40
-; SI-NEXT: s_and_b32 s42, s35, 0xff
-; SI-NEXT: s_lshl_b32 s43, s97, 8
+; SI-NEXT: s_and_b32 s42, s31, 0xff
+; SI-NEXT: s_lshl_b32 s43, s36, 8
; SI-NEXT: s_or_b32 s43, s42, s43
-; SI-NEXT: s_and_b32 s42, s71, 0xff
+; SI-NEXT: s_and_b32 s42, s81, 0xff
; SI-NEXT: s_lshl_b32 s42, s42, 16
-; SI-NEXT: s_lshl_b32 s76, s67, 24
-; SI-NEXT: s_or_b32 s35, s76, s42
-; SI-NEXT: s_and_b32 s42, s87, 0xff
+; SI-NEXT: s_lshl_b32 s76, s80, 24
+; SI-NEXT: s_or_b32 s21, s76, s42
+; SI-NEXT: s_and_b32 s42, s83, 0xff
; SI-NEXT: s_lshl_b32 s42, s42, 16
-; SI-NEXT: s_lshl_b32 s76, s83, 24
+; SI-NEXT: s_lshl_b32 s76, s23, 24
; SI-NEXT: s_or_b32 s42, s76, s42
-; SI-NEXT: s_and_b32 s76, s19, 0xff
-; SI-NEXT: s_lshl_b32 s77, s20, 8
+; SI-NEXT: s_and_b32 s76, s20, 0xff
+; SI-NEXT: s_lshl_b32 s77, s17, 8
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s66, 0xff
-; SI-NEXT: v_writelane_b32 v42, s78, 52
+; SI-NEXT: s_and_b32 s77, s34, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s85, 24
-; SI-NEXT: s_or_b32 s19, s78, s77
-; SI-NEXT: s_and_b32 s77, s94, 0xff
+; SI-NEXT: s_lshl_b32 s78, s86, 24
+; SI-NEXT: s_or_b32 s17, s78, s77
+; SI-NEXT: s_and_b32 s77, s26, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s27, 24
+; SI-NEXT: s_lshl_b32 s78, s94, 24
; SI-NEXT: s_and_b32 s76, s76, 0xffff
; SI-NEXT: s_or_b32 vcc_lo, s78, s77
-; SI-NEXT: s_or_b32 vcc_hi, s76, s19
-; SI-NEXT: s_and_b32 s76, s26, 0xff
-; SI-NEXT: s_lshl_b32 s77, s23, 8
+; SI-NEXT: s_or_b32 vcc_hi, s76, s17
+; SI-NEXT: s_and_b32 s76, s87, 0xff
+; SI-NEXT: s_lshl_b32 s77, s19, 8
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s98, 0xff
+; SI-NEXT: s_and_b32 s77, s35, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s17, 24
+; SI-NEXT: s_lshl_b32 s78, s16, 24
; SI-NEXT: s_or_b32 s71, s78, s77
-; SI-NEXT: s_and_b32 s77, s79, 0xff
+; SI-NEXT: s_and_b32 s77, s27, 0xff
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 40
-; SI-NEXT: s_and_b32 s41, s41, 0xffff
+; SI-NEXT: v_readlane_b32 s16, v43, 36
+; SI-NEXT: s_and_b32 s43, s43, 0xffff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s88, 24
+; SI-NEXT: s_lshl_b32 s78, s79, 24
; SI-NEXT: s_or_b32 s39, s76, s71
-; SI-NEXT: s_and_b32 s76, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 39
-; SI-NEXT: s_or_b32 s41, s41, s18
-; SI-NEXT: s_mov_b32 s31, s18
+; SI-NEXT: s_and_b32 s76, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 35
+; SI-NEXT: s_or_b32 s43, s43, s21
+; SI-NEXT: s_mov_b32 s23, s21
; SI-NEXT: s_or_b32 s38, s78, s77
-; SI-NEXT: s_lshl_b32 s77, s17, 8
-; SI-NEXT: v_readlane_b32 s18, v43, 38
+; SI-NEXT: s_lshl_b32 s77, s16, 8
+; SI-NEXT: v_readlane_b32 s21, v43, 34
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s18, 0xff
-; SI-NEXT: v_readlane_b32 s18, v43, 37
+; SI-NEXT: s_and_b32 s77, s21, 0xff
+; SI-NEXT: v_readlane_b32 s21, v43, 33
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s18, 24
+; SI-NEXT: s_lshl_b32 s78, s21, 24
; SI-NEXT: s_or_b32 s80, s78, s77
-; SI-NEXT: s_and_b32 s77, s95, 0xff
-; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 36
+; SI-NEXT: s_and_b32 s77, s93, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s93, 24
-; SI-NEXT: s_or_b32 s49, s76, s80
-; SI-NEXT: s_and_b32 s76, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 35
+; SI-NEXT: s_lshl_b32 s78, s95, 24
+; SI-NEXT: s_and_b32 s76, s76, 0xffff
; SI-NEXT: s_or_b32 s48, s78, s77
-; SI-NEXT: s_lshl_b32 s77, s17, 8
-; SI-NEXT: v_readlane_b32 s17, v43, 34
+; SI-NEXT: s_or_b32 s49, s76, s80
+; SI-NEXT: s_and_b32 s76, s91, 0xff
+; SI-NEXT: s_lshl_b32 s77, s90, 8
+; SI-NEXT: v_readlane_b32 s16, v43, 32
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 33
+; SI-NEXT: s_and_b32 s77, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 31
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s17, 24
+; SI-NEXT: s_lshl_b32 s78, s16, 24
; SI-NEXT: s_or_b32 s81, s78, s77
-; SI-NEXT: s_and_b32 s77, s30, 0xff
+; SI-NEXT: s_and_b32 s77, s37, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s69, 24
+; SI-NEXT: s_lshl_b32 s78, s68, 24
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 31
; SI-NEXT: s_or_b32 s50, s78, s77
; SI-NEXT: s_or_b32 s51, s76, s81
-; SI-NEXT: s_and_b32 s76, s17, 0xff
+; SI-NEXT: s_and_b32 s76, s92, 0xff
; SI-NEXT: s_lshl_b32 s77, s96, 8
-; SI-NEXT: v_readlane_b32 s17, v43, 30
+; SI-NEXT: v_readlane_b32 s16, v43, 30
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s17, 0xff
+; SI-NEXT: s_and_b32 s77, s16, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
; SI-NEXT: s_lshl_b32 s78, s82, 24
-; SI-NEXT: v_writelane_b32 v42, s96, 53
-; SI-NEXT: v_readlane_b32 s18, v43, 32
-; SI-NEXT: v_writelane_b32 v42, s82, 54
+; SI-NEXT: v_writelane_b32 v42, s96, 57
+; SI-NEXT: v_writelane_b32 v42, s82, 58
; SI-NEXT: s_or_b32 s82, s78, s77
; SI-NEXT: s_and_b32 s77, s18, 0xff
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 28
+; SI-NEXT: v_readlane_b32 s16, v43, 29
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s70, 24
+; SI-NEXT: s_lshl_b32 s78, s69, 24
; SI-NEXT: s_or_b32 s53, s76, s82
-; SI-NEXT: s_and_b32 s76, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 27
+; SI-NEXT: s_and_b32 s76, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 28
; SI-NEXT: s_or_b32 s52, s78, s77
-; SI-NEXT: s_lshl_b32 s77, s17, 8
-; SI-NEXT: v_readlane_b32 s18, v43, 26
+; SI-NEXT: s_lshl_b32 s77, s16, 8
+; SI-NEXT: v_readlane_b32 s16, v43, 27
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s18, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 25
+; SI-NEXT: s_and_b32 s77, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 26
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s17, 24
-; SI-NEXT: v_writelane_b32 v42, s16, 55
-; SI-NEXT: s_or_b32 s16, s78, s77
-; SI-NEXT: s_and_b32 s77, s89, 0xff
-; SI-NEXT: v_readlane_b32 s18, v43, 29
+; SI-NEXT: s_lshl_b32 s78, s16, 24
+; SI-NEXT: v_writelane_b32 v42, s84, 59
+; SI-NEXT: s_or_b32 s84, s78, s77
+; SI-NEXT: s_and_b32 s77, s70, 0xff
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s18, 24
+; SI-NEXT: s_lshl_b32 s78, s89, 24
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 22
-; SI-NEXT: v_readlane_b32 s18, v43, 21
+; SI-NEXT: v_readlane_b32 s16, v43, 23
+; SI-NEXT: v_readlane_b32 s18, v43, 22
; SI-NEXT: s_or_b32 s54, s78, s77
-; SI-NEXT: s_or_b32 s55, s76, s16
-; SI-NEXT: s_and_b32 s76, s17, 0xff
+; SI-NEXT: s_or_b32 s55, s76, s84
+; SI-NEXT: s_and_b32 s76, s16, 0xff
; SI-NEXT: s_lshl_b32 s77, s18, 8
-; SI-NEXT: v_readlane_b32 s17, v43, 20
+; SI-NEXT: v_readlane_b32 s16, v43, 21
; SI-NEXT: s_or_b32 s76, s76, s77
-; SI-NEXT: s_and_b32 s77, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 19
+; SI-NEXT: s_and_b32 s77, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 20
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s17, 24
-; SI-NEXT: v_readlane_b32 s17, v43, 24
+; SI-NEXT: s_lshl_b32 s78, s16, 24
+; SI-NEXT: v_readlane_b32 s16, v43, 25
; SI-NEXT: s_or_b32 s83, s78, s77
-; SI-NEXT: s_and_b32 s77, s17, 0xff
-; SI-NEXT: v_readlane_b32 s17, v43, 23
+; SI-NEXT: s_and_b32 s77, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 24
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s17, 24
+; SI-NEXT: s_lshl_b32 s78, s16, 24
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v43, 17
+; SI-NEXT: v_readlane_b32 s16, v43, 17
; SI-NEXT: v_readlane_b32 s18, v43, 16
; SI-NEXT: s_or_b32 s64, s78, s77
; SI-NEXT: s_or_b32 s65, s76, s83
-; SI-NEXT: s_and_b32 s76, s17, 0xff
+; SI-NEXT: s_and_b32 s76, s16, 0xff
; SI-NEXT: s_lshl_b32 s77, s18, 8
; SI-NEXT: v_readlane_b32 s18, v43, 15
; SI-NEXT: s_or_b32 s76, s76, s77
; SI-NEXT: s_and_b32 s77, s18, 0xff
; SI-NEXT: v_readlane_b32 s18, v43, 14
-; SI-NEXT: v_writelane_b32 v42, s89, 56
; SI-NEXT: s_lshl_b32 s77, s77, 16
; SI-NEXT: s_lshl_b32 s78, s18, 24
-; SI-NEXT: v_writelane_b32 v42, s70, 57
+; SI-NEXT: v_readlane_b32 s16, v43, 19
; SI-NEXT: s_or_b32 s85, s78, s77
-; SI-NEXT: s_and_b32 s77, s44, 0xff
-; SI-NEXT: v_readlane_b32 s18, v43, 18
-; SI-NEXT: s_and_b32 s7, s7, 0xffff
-; SI-NEXT: s_and_b32 s9, s9, 0xffff
-; SI-NEXT: s_and_b32 s11, s11, 0xffff
-; SI-NEXT: s_and_b32 s13, s13, 0xffff
-; SI-NEXT: s_and_b32 s15, s15, 0xffff
-; SI-NEXT: s_and_b32 s43, s43, 0xffff
-; SI-NEXT: v_writelane_b32 v42, s69, 58
+; SI-NEXT: s_and_b32 s77, s16, 0xff
+; SI-NEXT: v_readlane_b32 s16, v43, 18
+; SI-NEXT: v_writelane_b32 v42, s89, 60
; SI-NEXT: s_lshl_b32 s77, s77, 16
-; SI-NEXT: s_lshl_b32 s78, s18, 24
+; SI-NEXT: s_lshl_b32 s78, s16, 24
; SI-NEXT: s_and_b32 s76, s76, 0xffff
-; SI-NEXT: s_and_b32 s44, s29, 0xffff
-; SI-NEXT: s_or_b32 s7, s7, s37
-; SI-NEXT: s_or_b32 s9, s9, s68
-; SI-NEXT: s_or_b32 s11, s11, s99
-; SI-NEXT: s_or_b32 s13, s13, s92
-; SI-NEXT: s_or_b32 s15, s15, s90
-; SI-NEXT: s_or_b32 s43, s43, s35
-; SI-NEXT: v_writelane_b32 v42, s30, 59
-; SI-NEXT: s_mov_b32 s23, s91
-; SI-NEXT: s_mov_b32 s91, s36
+; SI-NEXT: s_and_b32 s44, s44, 0xffff
+; SI-NEXT: v_writelane_b32 v42, s70, 61
; SI-NEXT: s_or_b32 s66, s78, s77
; SI-NEXT: s_or_b32 s67, s76, s85
-; SI-NEXT: s_and_b32 s45, s45, 0xffff
-; SI-NEXT: s_and_b32 s46, s46, 0xffff
-; SI-NEXT: s_and_b32 s47, s47, 0xffff
-; SI-NEXT: s_and_b32 s56, s56, 0xffff
-; SI-NEXT: s_and_b32 s57, s57, 0xffff
-; SI-NEXT: s_and_b32 s30, s58, 0xffff
-; SI-NEXT: s_and_b32 s34, s59, 0xffff
-; SI-NEXT: s_and_b32 s36, s60, 0xffff
-; SI-NEXT: s_and_b32 s97, s61, 0xffff
-; SI-NEXT: s_and_b32 s86, s62, 0xffff
-; SI-NEXT: s_and_b32 s98, s63, 0xffff
-; SI-NEXT: s_and_b32 s17, s72, 0xffff
-; SI-NEXT: s_and_b32 s87, s73, 0xffff
; SI-NEXT: s_and_b32 s96, s74, 0xffff
; SI-NEXT: s_and_b32 s22, s75, 0xffff
; SI-NEXT: s_or_b32 s74, s44, s4
; SI-NEXT: s_mov_b32 s75, s5
; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16
-; SI-NEXT: s_mov_b32 s70, s93
-; SI-NEXT: s_mov_b32 s69, s95
-; SI-NEXT: s_mov_b32 s93, s28
-; SI-NEXT: s_or_b32 s72, s45, s6
-; SI-NEXT: s_mov_b32 s73, s7
-; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16
+; SI-NEXT: s_lshr_b64 s[4:5], vcc, 16
+; SI-NEXT: v_writelane_b32 v42, s4, 50
+; SI-NEXT: s_and_b32 s46, s46, 0xffff
+; SI-NEXT: s_and_b32 s87, s73, 0xffff
+; SI-NEXT: v_writelane_b32 v42, s5, 51
+; SI-NEXT: s_and_b32 s13, s13, 0xffff
+; SI-NEXT: s_and_b32 s47, s47, 0xffff
+; SI-NEXT: s_and_b32 s86, s62, 0xffff
+; SI-NEXT: s_and_b32 s98, s63, 0xffff
+; SI-NEXT: s_and_b32 s21, s72, 0xffff
; SI-NEXT: s_or_b32 s62, s46, s8
; SI-NEXT: s_mov_b32 s63, s9
; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16
+; SI-NEXT: s_or_b32 s8, s87, s54
+; SI-NEXT: s_mov_b32 s9, s55
+; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16
+; SI-NEXT: v_readlane_b32 s16, v42, 52
+; SI-NEXT: s_or_b32 s13, s13, s99
+; SI-NEXT: s_and_b32 s15, s15, 0xffff
+; SI-NEXT: s_and_b32 s41, s41, 0xffff
+; SI-NEXT: s_and_b32 s56, s56, 0xffff
+; SI-NEXT: s_and_b32 s36, s60, 0xffff
+; SI-NEXT: s_and_b32 s97, s61, 0xffff
; SI-NEXT: s_or_b32 s60, s47, s10
; SI-NEXT: s_mov_b32 s61, s11
-; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16
+; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 16
+; SI-NEXT: s_or_b32 s10, s21, s52
+; SI-NEXT: s_mov_b32 s11, s53
+; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16
+; SI-NEXT: s_lshr_b32 s55, s16, 16
+; SI-NEXT: v_readlane_b32 s16, v42, 53
+; SI-NEXT: s_or_b32 s15, s15, s24
+; SI-NEXT: s_or_b32 s41, s41, s25
+; SI-NEXT: s_and_b32 s45, s45, 0xffff
+; SI-NEXT: s_and_b32 s57, s57, 0xffff
+; SI-NEXT: s_and_b32 s30, s58, 0xffff
+; SI-NEXT: s_and_b32 s34, s59, 0xffff
; SI-NEXT: s_or_b32 s58, s56, s12
; SI-NEXT: s_mov_b32 s59, s13
-; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16
+; SI-NEXT: s_or_b32 s12, s98, s50
+; SI-NEXT: s_mov_b32 s13, s51
+; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16
+; SI-NEXT: s_lshr_b32 s53, s16, 16
+; SI-NEXT: v_readlane_b32 s16, v42, 54
+; SI-NEXT: s_mov_b32 s70, s69
+; SI-NEXT: s_mov_b32 s69, s68
+; SI-NEXT: s_mov_b32 s68, s37
+; SI-NEXT: s_mov_b32 s37, s95
+; SI-NEXT: s_or_b32 s72, s45, s6
+; SI-NEXT: s_mov_b32 s73, s7
+; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16
; SI-NEXT: s_or_b32 s56, s57, s14
; SI-NEXT: s_mov_b32 s57, s15
-; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16
; SI-NEXT: s_or_b32 s46, s30, s40
; SI-NEXT: s_mov_b32 s47, s41
-; SI-NEXT: s_or_b32 s44, s34, s42
-; SI-NEXT: s_mov_b32 s34, s4
-; SI-NEXT: s_mov_b32 s45, s43
-; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16
-; SI-NEXT: s_or_b32 s42, s36, vcc_lo
-; SI-NEXT: s_mov_b32 s43, vcc_hi
-; SI-NEXT: s_lshr_b64 vcc, vcc, 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16
; SI-NEXT: s_or_b32 s40, s97, s38
; SI-NEXT: s_mov_b32 s41, s39
; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16
; SI-NEXT: s_or_b32 s14, s86, s48
; SI-NEXT: s_mov_b32 s15, s49
; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16
-; SI-NEXT: s_or_b32 s12, s98, s50
-; SI-NEXT: s_mov_b32 s13, s51
-; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16
-; SI-NEXT: s_or_b32 s10, s17, s52
-; SI-NEXT: s_mov_b32 s11, s53
-; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16
-; SI-NEXT: s_or_b32 s8, s87, s54
-; SI-NEXT: s_mov_b32 s9, s55
-; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16
; SI-NEXT: s_or_b32 s6, s96, s64
; SI-NEXT: s_mov_b32 s7, s65
; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16
; SI-NEXT: s_or_b32 s4, s22, s66
; SI-NEXT: s_mov_b32 s5, s67
; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16
-; SI-NEXT: v_readlane_b32 s17, v42, 51
-; SI-NEXT: s_lshr_b32 s55, s17, 16
-; SI-NEXT: s_lshr_b32 s53, s37, 16
-; SI-NEXT: s_lshr_b32 s51, s68, 16
-; SI-NEXT: s_lshr_b32 s49, s99, 16
-; SI-NEXT: s_lshr_b32 s86, s92, 16
-; SI-NEXT: s_lshr_b32 s39, s90, 16
-; SI-NEXT: s_lshr_b32 s18, s31, 16
-; SI-NEXT: s_lshr_b32 s22, s35, 16
-; SI-NEXT: s_lshr_b32 s97, s19, 16
+; SI-NEXT: s_lshr_b32 s51, s16, 16
+; SI-NEXT: v_readlane_b32 s16, v42, 55
+; SI-NEXT: s_or_b32 s44, s34, s42
+; SI-NEXT: s_mov_b32 s45, s43
+; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16
+; SI-NEXT: s_or_b32 s42, s36, vcc_lo
+; SI-NEXT: s_mov_b32 s43, vcc_hi
+; SI-NEXT: s_lshr_b32 s49, s16, 16
+; SI-NEXT: s_lshr_b32 s86, s99, 16
+; SI-NEXT: s_lshr_b32 s39, s24, 16
+; SI-NEXT: s_lshr_b32 s18, s25, 16
+; SI-NEXT: s_lshr_b32 s22, s23, 16
+; SI-NEXT: s_lshr_b32 s97, s17, 16
; SI-NEXT: s_lshr_b32 s65, s71, 16
-; SI-NEXT: s_lshr_b32 s19, s80, 16
-; SI-NEXT: s_lshr_b32 s71, s81, 16
+; SI-NEXT: s_lshr_b32 s71, s80, 16
+; SI-NEXT: s_lshr_b32 s21, s81, 16
; SI-NEXT: s_lshr_b32 s67, s82, 16
-; SI-NEXT: v_readlane_b32 s82, v42, 54
-; SI-NEXT: v_readlane_b32 s96, v42, 53
-; SI-NEXT: s_lshr_b32 s80, s16, 16
-; SI-NEXT: v_readlane_b32 s16, v42, 55
+; SI-NEXT: v_readlane_b32 s82, v42, 58
+; SI-NEXT: v_readlane_b32 s96, v42, 57
+; SI-NEXT: s_lshr_b32 s80, s84, 16
+; SI-NEXT: v_readlane_b32 s84, v42, 59
; SI-NEXT: s_lshr_b32 s81, s83, 16
-; SI-NEXT: s_mov_b32 s90, s93
-; SI-NEXT: v_readlane_b32 s78, v42, 52
-; SI-NEXT: s_mov_b32 s95, s69
-; SI-NEXT: s_mov_b32 s93, s70
-; SI-NEXT: v_readlane_b32 s30, v42, 59
-; SI-NEXT: v_readlane_b32 s69, v42, 58
-; SI-NEXT: v_readlane_b32 s70, v42, 57
-; SI-NEXT: v_readlane_b32 s89, v42, 56
+; SI-NEXT: s_mov_b32 s95, s37
+; SI-NEXT: s_mov_b32 s37, s68
+; SI-NEXT: s_mov_b32 s68, s69
+; SI-NEXT: s_mov_b32 s69, s70
+; SI-NEXT: v_readlane_b32 s70, v42, 61
+; SI-NEXT: v_readlane_b32 s89, v42, 60
; SI-NEXT: s_lshr_b32 s77, s85, 16
-; SI-NEXT: s_mov_b32 s84, vcc_lo
-; SI-NEXT: s_mov_b32 s36, s91
-; SI-NEXT: s_mov_b32 s91, s23
+; SI-NEXT: v_readlane_b32 s24, v42, 56
; SI-NEXT: s_cbranch_execnz .LBB97_3
; SI-NEXT: .LBB97_2: ; %cmp.true
-; SI-NEXT: v_readlane_b32 s4, v43, 42
+; SI-NEXT: v_readlane_b32 s4, v43, 41
; SI-NEXT: s_add_i32 s4, s4, 3
-; SI-NEXT: v_readlane_b32 s6, v43, 41
+; SI-NEXT: v_readlane_b32 s6, v43, 19
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: s_lshl_b32 s5, s90, 8
+; SI-NEXT: s_lshl_b32 s5, s24, 8
; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: v_readlane_b32 s5, v43, 18
@@ -200989,14 +200995,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_or_b32 s5, s6, s5
-; SI-NEXT: v_readlane_b32 s6, v43, 44
+; SI-NEXT: v_readlane_b32 s6, v43, 43
; SI-NEXT: s_add_i32 s6, s6, 3
-; SI-NEXT: v_readlane_b32 s8, v43, 24
+; SI-NEXT: v_readlane_b32 s7, v43, 39
+; SI-NEXT: v_readlane_b32 s8, v43, 25
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: s_lshl_b32 s7, s91, 8
+; SI-NEXT: s_lshl_b32 s7, s7, 8
; SI-NEXT: s_add_i32 s8, s8, 3
; SI-NEXT: s_or_b32 s6, s7, s6
-; SI-NEXT: v_readlane_b32 s7, v43, 23
+; SI-NEXT: v_readlane_b32 s7, v43, 24
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: s_lshl_b32 s7, s7, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
@@ -201004,15 +201011,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s7, s7, s8
; SI-NEXT: s_and_b32 s6, s6, 0xffff
; SI-NEXT: s_or_b32 s6, s7, s6
-; SI-NEXT: v_readlane_b32 s7, v43, 22
+; SI-NEXT: v_readlane_b32 s7, v43, 23
; SI-NEXT: s_add_i32 s7, s7, 3
-; SI-NEXT: v_readlane_b32 s8, v43, 21
-; SI-NEXT: v_readlane_b32 s9, v43, 20
+; SI-NEXT: v_readlane_b32 s8, v43, 22
+; SI-NEXT: v_readlane_b32 s9, v43, 21
; SI-NEXT: s_and_b32 s7, s7, 0xff
; SI-NEXT: s_lshl_b32 s8, s8, 8
; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: s_or_b32 s7, s8, s7
-; SI-NEXT: v_readlane_b32 s8, v43, 19
+; SI-NEXT: v_readlane_b32 s8, v43, 20
; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s8, s8, 24
; SI-NEXT: s_lshl_b32 s9, s9, 16
@@ -201020,29 +201027,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_and_b32 s7, s7, 0xffff
; SI-NEXT: s_or_b32 s7, s8, s7
-; SI-NEXT: v_readlane_b32 s8, v43, 43
+; SI-NEXT: v_readlane_b32 s8, v43, 42
; SI-NEXT: s_add_i32 s8, s8, 3
+; SI-NEXT: v_readlane_b32 s9, v43, 38
; SI-NEXT: s_and_b32 s8, s8, 0xff
-; SI-NEXT: s_lshl_b32 s9, s78, 8
-; SI-NEXT: s_add_i32 s10, s89, 3
+; SI-NEXT: s_lshl_b32 s9, s9, 8
+; SI-NEXT: s_add_i32 s10, s70, 3
; SI-NEXT: s_or_b32 s8, s9, s8
-; SI-NEXT: v_readlane_b32 s9, v43, 29
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: s_lshl_b32 s9, s9, 24
+; SI-NEXT: s_lshl_b32 s9, s89, 24
; SI-NEXT: s_lshl_b32 s10, s10, 16
; SI-NEXT: s_addk_i32 s8, 0x300
; SI-NEXT: s_or_b32 s9, s9, s10
; SI-NEXT: s_and_b32 s8, s8, 0xffff
; SI-NEXT: s_or_b32 s8, s9, s8
-; SI-NEXT: v_readlane_b32 s9, v43, 28
+; SI-NEXT: v_readlane_b32 s9, v43, 29
; SI-NEXT: s_add_i32 s9, s9, 3
-; SI-NEXT: v_readlane_b32 s10, v43, 27
-; SI-NEXT: v_readlane_b32 s11, v43, 26
+; SI-NEXT: v_readlane_b32 s10, v43, 28
+; SI-NEXT: v_readlane_b32 s11, v43, 27
; SI-NEXT: s_and_b32 s9, s9, 0xff
; SI-NEXT: s_lshl_b32 s10, s10, 8
; SI-NEXT: s_add_i32 s11, s11, 3
; SI-NEXT: s_or_b32 s9, s10, s9
-; SI-NEXT: v_readlane_b32 s10, v43, 25
+; SI-NEXT: v_readlane_b32 s10, v43, 26
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s10, s10, 24
; SI-NEXT: s_lshl_b32 s11, s11, 16
@@ -201050,23 +201057,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s10, s10, s11
; SI-NEXT: s_and_b32 s9, s9, 0xffff
; SI-NEXT: s_or_b32 s9, s10, s9
-; SI-NEXT: v_readlane_b32 s10, v43, 46
+; SI-NEXT: v_readlane_b32 s10, v43, 45
; SI-NEXT: s_add_i32 s10, s10, 3
-; SI-NEXT: v_readlane_b32 s11, v43, 45
-; SI-NEXT: v_readlane_b32 s12, v43, 32
+; SI-NEXT: v_readlane_b32 s11, v43, 44
+; SI-NEXT: v_readlane_b32 s12, v43, 40
; SI-NEXT: s_and_b32 s10, s10, 0xff
; SI-NEXT: s_lshl_b32 s11, s11, 8
; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: s_and_b32 s12, s12, 0xff
-; SI-NEXT: s_lshl_b32 s11, s70, 24
+; SI-NEXT: s_lshl_b32 s11, s69, 24
; SI-NEXT: s_lshl_b32 s12, s12, 16
; SI-NEXT: s_addk_i32 s10, 0x300
; SI-NEXT: s_or_b32 s11, s11, s12
; SI-NEXT: s_and_b32 s10, s10, 0xffff
; SI-NEXT: s_or_b32 s10, s11, s10
-; SI-NEXT: v_readlane_b32 s11, v43, 31
-; SI-NEXT: s_add_i32 s11, s11, 3
+; SI-NEXT: s_add_i32 s11, s92, 3
; SI-NEXT: v_readlane_b32 s13, v43, 30
; SI-NEXT: s_and_b32 s11, s11, 0xff
; SI-NEXT: s_lshl_b32 s12, s96, 8
@@ -201079,27 +201085,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_and_b32 s11, s11, 0xffff
; SI-NEXT: s_or_b32 s11, s12, s11
-; SI-NEXT: s_add_i32 s12, s36, 3
+; SI-NEXT: v_readlane_b32 s12, v43, 37
+; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: s_and_b32 s12, s12, 0xff
-; SI-NEXT: s_lshl_b32 s13, s16, 8
-; SI-NEXT: s_add_i32 s14, s30, 3
+; SI-NEXT: s_lshl_b32 s13, s84, 8
+; SI-NEXT: s_add_i32 s14, s37, 3
; SI-NEXT: s_or_b32 s12, s13, s12
; SI-NEXT: s_and_b32 s14, s14, 0xff
-; SI-NEXT: s_lshl_b32 s13, s69, 24
+; SI-NEXT: s_lshl_b32 s13, s68, 24
; SI-NEXT: s_lshl_b32 s14, s14, 16
; SI-NEXT: s_addk_i32 s12, 0x300
; SI-NEXT: s_or_b32 s13, s13, s14
; SI-NEXT: s_and_b32 s12, s12, 0xffff
; SI-NEXT: s_or_b32 s12, s13, s12
-; SI-NEXT: v_readlane_b32 s13, v43, 36
-; SI-NEXT: s_add_i32 s13, s13, 3
-; SI-NEXT: v_readlane_b32 s14, v43, 35
-; SI-NEXT: v_readlane_b32 s15, v43, 34
+; SI-NEXT: s_add_i32 s13, s91, 3
+; SI-NEXT: v_readlane_b32 s15, v43, 32
; SI-NEXT: s_and_b32 s13, s13, 0xff
-; SI-NEXT: s_lshl_b32 s14, s14, 8
+; SI-NEXT: s_lshl_b32 s14, s90, 8
; SI-NEXT: s_add_i32 s15, s15, 3
; SI-NEXT: s_or_b32 s13, s14, s13
-; SI-NEXT: v_readlane_b32 s14, v43, 33
+; SI-NEXT: v_readlane_b32 s14, v43, 31
; SI-NEXT: s_and_b32 s15, s15, 0xff
; SI-NEXT: s_lshl_b32 s14, s14, 24
; SI-NEXT: s_lshl_b32 s15, s15, 16
@@ -201107,29 +201112,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: s_and_b32 s13, s13, 0xffff
; SI-NEXT: s_or_b32 s13, s14, s13
-; SI-NEXT: v_readlane_b32 s14, v42, 50
+; SI-NEXT: v_readlane_b32 s14, v42, 49
; SI-NEXT: s_add_i32 s17, s14, 3
-; SI-NEXT: v_readlane_b32 s15, v42, 49
+; SI-NEXT: v_readlane_b32 s15, v42, 48
; SI-NEXT: s_and_b32 s14, s17, 0xff
; SI-NEXT: s_lshl_b32 s15, s15, 8
-; SI-NEXT: s_add_i32 s16, s95, 3
+; SI-NEXT: s_add_i32 s16, s93, 3
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: s_lshl_b32 s15, s93, 24
+; SI-NEXT: s_lshl_b32 s15, s95, 24
; SI-NEXT: s_lshl_b32 s16, s16, 16
; SI-NEXT: s_addk_i32 s14, 0x300
; SI-NEXT: s_or_b32 s15, s15, s16
; SI-NEXT: s_and_b32 s14, s14, 0xffff
; SI-NEXT: s_or_b32 s14, s15, s14
-; SI-NEXT: v_readlane_b32 s15, v43, 40
+; SI-NEXT: v_readlane_b32 s15, v43, 36
; SI-NEXT: s_add_i32 s15, s15, 3
-; SI-NEXT: v_readlane_b32 s16, v43, 39
-; SI-NEXT: v_readlane_b32 s17, v43, 38
+; SI-NEXT: v_readlane_b32 s16, v43, 35
+; SI-NEXT: v_readlane_b32 s17, v43, 34
; SI-NEXT: s_and_b32 s15, s15, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 8
; SI-NEXT: s_add_i32 s17, s17, 3
; SI-NEXT: s_or_b32 s15, s16, s15
-; SI-NEXT: v_readlane_b32 s16, v43, 37
+; SI-NEXT: v_readlane_b32 s16, v43, 33
; SI-NEXT: s_and_b32 s17, s17, 0xff
; SI-NEXT: s_lshl_b32 s16, s16, 24
; SI-NEXT: s_lshl_b32 s17, s17, 16
@@ -201137,15 +201142,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s15, s15, 0xffff
; SI-NEXT: s_or_b32 s15, s16, s15
-; SI-NEXT: v_readlane_b32 s16, v42, 48
+; SI-NEXT: v_readlane_b32 s16, v42, 47
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 47
-; SI-NEXT: v_readlane_b32 s18, v42, 42
+; SI-NEXT: v_readlane_b32 s17, v42, 46
+; SI-NEXT: v_readlane_b32 s18, v42, 41
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s99, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 43
+; SI-NEXT: v_readlane_b32 s17, v42, 42
; SI-NEXT: s_and_b32 s18, s99, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 24
; SI-NEXT: s_lshl_b32 s18, s18, 16
@@ -201153,15 +201158,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 38
+; SI-NEXT: v_readlane_b32 s17, v42, 37
; SI-NEXT: s_add_i32 s87, s17, 3
-; SI-NEXT: v_readlane_b32 s18, v42, 35
-; SI-NEXT: v_readlane_b32 s19, v42, 29
+; SI-NEXT: v_readlane_b32 s18, v42, 34
+; SI-NEXT: v_readlane_b32 s19, v42, 28
; SI-NEXT: s_and_b32 s17, s87, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 8
; SI-NEXT: s_add_i32 s23, s19, 3
; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: v_readlane_b32 s18, v42, 30
+; SI-NEXT: v_readlane_b32 s18, v42, 29
; SI-NEXT: s_and_b32 s23, s23, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 24
; SI-NEXT: s_lshl_b32 s23, s23, 16
@@ -201170,16 +201175,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_and_b32 s17, s17, 0xffff
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: s_add_i32 s40, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 45
+; SI-NEXT: v_readlane_b32 s16, v42, 44
; SI-NEXT: s_add_i32 s41, s17, 0x3000000
; SI-NEXT: s_add_i32 s68, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 44
-; SI-NEXT: v_readlane_b32 s18, v42, 39
+; SI-NEXT: v_readlane_b32 s17, v42, 43
+; SI-NEXT: v_readlane_b32 s18, v42, 38
; SI-NEXT: s_and_b32 s16, s68, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s96, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 40
+; SI-NEXT: v_readlane_b32 s17, v42, 39
; SI-NEXT: s_and_b32 s18, s96, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 24
; SI-NEXT: s_lshl_b32 s18, s18, 16
@@ -201187,33 +201192,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 33
+; SI-NEXT: v_readlane_b32 s17, v42, 32
; SI-NEXT: s_add_i32 s17, s17, 3
-; SI-NEXT: v_readlane_b32 s18, v42, 31
+; SI-NEXT: v_readlane_b32 s18, v42, 30
; SI-NEXT: s_and_b32 s17, s17, 0xff
; SI-NEXT: s_lshl_b32 s18, s18, 8
; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: v_readlane_b32 s18, v42, 24
+; SI-NEXT: v_readlane_b32 s18, v42, 23
; SI-NEXT: s_addk_i32 s17, 0x300
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_add_i32 s42, s16, 0x3000000
; SI-NEXT: s_and_b32 s16, s17, 0xffff
-; SI-NEXT: v_readlane_b32 s17, v42, 25
+; SI-NEXT: v_readlane_b32 s17, v42, 24
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 24
; SI-NEXT: s_lshl_b32 s18, s18, 16
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s43, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 46
+; SI-NEXT: v_readlane_b32 s16, v42, 45
; SI-NEXT: s_add_i32 s23, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 41
-; SI-NEXT: v_readlane_b32 s18, v42, 37
+; SI-NEXT: v_readlane_b32 s17, v42, 40
+; SI-NEXT: v_readlane_b32 s18, v42, 36
; SI-NEXT: s_and_b32 s16, s23, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s86, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 36
+; SI-NEXT: v_readlane_b32 s17, v42, 35
; SI-NEXT: s_and_b32 s18, s86, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201222,15 +201227,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s44, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 28
+; SI-NEXT: v_readlane_b32 s16, v42, 27
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 22
-; SI-NEXT: v_readlane_b32 s18, v42, 16
+; SI-NEXT: v_readlane_b32 s17, v42, 21
+; SI-NEXT: v_readlane_b32 s18, v42, 15
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 15
+; SI-NEXT: v_readlane_b32 s17, v42, 14
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201239,15 +201244,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s45, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 34
+; SI-NEXT: v_readlane_b32 s16, v42, 33
; SI-NEXT: s_add_i32 s83, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 32
-; SI-NEXT: v_readlane_b32 s18, v42, 26
+; SI-NEXT: v_readlane_b32 s17, v42, 31
+; SI-NEXT: v_readlane_b32 s18, v42, 25
; SI-NEXT: s_and_b32 s16, s83, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 27
+; SI-NEXT: v_readlane_b32 s17, v42, 26
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201256,15 +201261,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s46, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 23
+; SI-NEXT: v_readlane_b32 s16, v42, 22
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 21
-; SI-NEXT: v_readlane_b32 s18, v42, 17
+; SI-NEXT: v_readlane_b32 s17, v42, 20
+; SI-NEXT: v_readlane_b32 s18, v42, 16
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 18
+; SI-NEXT: v_readlane_b32 s17, v42, 17
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201273,15 +201278,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s47, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 20
+; SI-NEXT: v_readlane_b32 s16, v42, 19
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 19
-; SI-NEXT: v_readlane_b32 s18, v42, 13
+; SI-NEXT: v_readlane_b32 s17, v42, 18
+; SI-NEXT: v_readlane_b32 s18, v42, 12
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 14
+; SI-NEXT: v_readlane_b32 s17, v42, 13
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201290,15 +201295,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s56, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 12
+; SI-NEXT: v_readlane_b32 s16, v42, 11
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 11
-; SI-NEXT: v_readlane_b32 s18, v42, 7
+; SI-NEXT: v_readlane_b32 s17, v42, 10
+; SI-NEXT: v_readlane_b32 s18, v42, 6
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 8
+; SI-NEXT: v_readlane_b32 s17, v42, 7
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201307,15 +201312,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s57, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 10
+; SI-NEXT: v_readlane_b32 s16, v42, 9
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 9
-; SI-NEXT: v_readlane_b32 s18, v42, 5
+; SI-NEXT: v_readlane_b32 s17, v42, 8
+; SI-NEXT: v_readlane_b32 s18, v42, 4
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 6
+; SI-NEXT: v_readlane_b32 s17, v42, 5
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201324,15 +201329,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s58, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 4
+; SI-NEXT: v_readlane_b32 s16, v42, 3
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 3
-; SI-NEXT: v_readlane_b32 s18, v43, 63
+; SI-NEXT: v_readlane_b32 s17, v42, 2
+; SI-NEXT: v_readlane_b32 s18, v43, 62
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v42, 0
+; SI-NEXT: v_readlane_b32 s17, v43, 63
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201341,15 +201346,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s59, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v42, 2
+; SI-NEXT: v_readlane_b32 s16, v42, 1
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v42, 1
-; SI-NEXT: v_readlane_b32 s18, v43, 61
+; SI-NEXT: v_readlane_b32 s17, v42, 0
+; SI-NEXT: v_readlane_b32 s18, v43, 60
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 62
+; SI-NEXT: v_readlane_b32 s17, v43, 61
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201358,15 +201363,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s60, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v43, 60
+; SI-NEXT: v_readlane_b32 s16, v43, 59
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v43, 59
-; SI-NEXT: v_readlane_b32 s18, v43, 55
+; SI-NEXT: v_readlane_b32 s17, v43, 58
+; SI-NEXT: v_readlane_b32 s18, v43, 54
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 56
+; SI-NEXT: v_readlane_b32 s17, v43, 55
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201375,15 +201380,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s61, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v43, 58
+; SI-NEXT: v_readlane_b32 s16, v43, 57
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v43, 57
-; SI-NEXT: v_readlane_b32 s18, v43, 53
+; SI-NEXT: v_readlane_b32 s17, v43, 56
+; SI-NEXT: v_readlane_b32 s18, v43, 52
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 54
+; SI-NEXT: v_readlane_b32 s17, v43, 53
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201392,15 +201397,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_add_i32 s62, s16, 0x3000000
-; SI-NEXT: v_readlane_b32 s16, v43, 52
+; SI-NEXT: v_readlane_b32 s16, v43, 51
; SI-NEXT: s_add_i32 s16, s16, 3
-; SI-NEXT: v_readlane_b32 s17, v43, 51
-; SI-NEXT: v_readlane_b32 s18, v43, 49
+; SI-NEXT: v_readlane_b32 s17, v43, 50
+; SI-NEXT: v_readlane_b32 s18, v43, 48
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 50
+; SI-NEXT: v_readlane_b32 s17, v43, 49
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201429,12 +201434,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_readlane_b32 s16, v43, 1
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_readlane_b32 s17, v43, 0
-; SI-NEXT: v_readlane_b32 s18, v43, 47
+; SI-NEXT: v_readlane_b32 s18, v43, 46
; SI-NEXT: s_and_b32 s16, s16, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
; SI-NEXT: s_add_i32 s18, s18, 3
; SI-NEXT: s_or_b32 s16, s17, s16
-; SI-NEXT: v_readlane_b32 s17, v43, 48
+; SI-NEXT: v_readlane_b32 s17, v43, 47
; SI-NEXT: s_and_b32 s18, s18, 0xff
; SI-NEXT: s_addk_i32 s16, 0x300
; SI-NEXT: s_lshl_b32 s17, s17, 24
@@ -201490,6 +201495,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_add_i32 s15, s15, 0x3000000
; SI-NEXT: s_add_i32 s75, s16, 0x3000000
; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16
+; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16
; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16
; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16
; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16
@@ -201499,12 +201505,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16
; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16
; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16
-; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16
-; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16
-; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16
-; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16
-; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16
-; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16
+; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16
+; SI-NEXT: s_lshr_b64 s[88:89], s[58:59], 16
+; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16
+; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16
+; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16
+; SI-NEXT: v_writelane_b32 v42, s16, 50
; SI-NEXT: s_lshr_b32 s55, s75, 16
; SI-NEXT: s_lshr_b32 s53, s73, 16
; SI-NEXT: s_lshr_b32 s51, s63, 16
@@ -201515,12 +201521,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshr_b32 s22, s45, 16
; SI-NEXT: s_lshr_b32 s97, s43, 16
; SI-NEXT: s_lshr_b32 s65, s41, 16
-; SI-NEXT: s_lshr_b32 s19, s15, 16
-; SI-NEXT: s_lshr_b32 s71, s13, 16
+; SI-NEXT: s_lshr_b32 s71, s15, 16
+; SI-NEXT: s_lshr_b32 s21, s13, 16
; SI-NEXT: s_lshr_b32 s67, s11, 16
; SI-NEXT: s_lshr_b32 s80, s9, 16
; SI-NEXT: s_lshr_b32 s81, s7, 16
; SI-NEXT: s_lshr_b32 s77, s5, 16
+; SI-NEXT: v_writelane_b32 v42, s17, 51
; SI-NEXT: .LBB97_3: ; %end
; SI-NEXT: s_and_b32 s16, s74, 0xffff
; SI-NEXT: s_lshl_b32 s17, s76, 16
@@ -201561,7 +201568,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s60, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s88, 16
+; SI-NEXT: s_lshl_b32 s17, s78, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201575,7 +201582,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s58, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s20, 16
+; SI-NEXT: s_lshl_b32 s17, s88, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201589,7 +201596,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s56, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s24, 16
+; SI-NEXT: s_lshl_b32 s17, s94, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201603,7 +201610,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s46, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s34, 16
+; SI-NEXT: s_lshl_b32 s17, s30, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201617,7 +201624,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s44, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s94, 16
+; SI-NEXT: s_lshl_b32 s17, s34, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201627,11 +201634,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_lshl_b32 s17, s22, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
; SI-NEXT: s_or_b32 s16, s16, s17
+; SI-NEXT: v_readlane_b32 s18, v42, 50
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s42, 0xffff
-; SI-NEXT: s_lshl_b32 s17, s84, 16
+; SI-NEXT: s_lshl_b32 s17, s18, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201666,7 +201674,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: s_and_b32 s14, s15, 0xffff
-; SI-NEXT: s_lshl_b32 s15, s19, 16
+; SI-NEXT: s_lshl_b32 s15, s71, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0
; SI-NEXT: s_or_b32 s14, s14, s15
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201680,7 +201688,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: s_and_b32 s12, s13, 0xffff
-; SI-NEXT: s_lshl_b32 s13, s71, 16
+; SI-NEXT: s_lshl_b32 s13, s21, 16
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
@@ -201744,6 +201752,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_readlane_b32 s19, v42, 51
; SI-NEXT: v_readlane_b32 s99, v41, 35
; SI-NEXT: v_readlane_b32 s98, v41, 34
; SI-NEXT: v_readlane_b32 s97, v41, 33
@@ -201788,6 +201797,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB97_4:
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v42, s4, 50
; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $sgpr76
; SI-NEXT: ; implicit-def: $sgpr55
@@ -201798,32 +201809,32 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr78
; SI-NEXT: ; implicit-def: $sgpr49
; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr39
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr30
; SI-NEXT: ; implicit-def: $sgpr18
; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr34
; SI-NEXT: ; implicit-def: $sgpr22
; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: v_writelane_b32 v42, s5, 51
; SI-NEXT: ; implicit-def: $sgpr97
; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $sgpr38
; SI-NEXT: ; implicit-def: $sgpr65
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr71
; SI-NEXT: ; implicit-def: $sgpr12
; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr71
+; SI-NEXT: ; implicit-def: $sgpr21
; SI-NEXT: ; implicit-def: $sgpr10
; SI-NEXT: ; implicit-def: $sgpr52
; SI-NEXT: ; implicit-def: $sgpr67
@@ -208832,24 +208843,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
@@ -208870,10 +208863,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; implicit-def: $vgpr57
; GFX9-NEXT: ; implicit-def: $vgpr35
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: ; implicit-def: $vgpr63
@@ -208881,7 +208873,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr39
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr59
; GFX9-NEXT: ; implicit-def: $vgpr37
; GFX9-NEXT: ; implicit-def: $vgpr47
; GFX9-NEXT: ; implicit-def: $vgpr49
@@ -208890,13 +208881,33 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr62
; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr42
-; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -208904,9 +208915,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
@@ -208930,7 +208944,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: s_waitcnt vmcnt(43)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -208989,180 +209003,195 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB98_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(62)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(62)
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -209172,9 +209201,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
-; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29
; GFX9-NEXT: .LBB98_2: ; %Flow
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v43, v50
+; GFX9-NEXT: v_mov_b32_e32 v50, v40
+; GFX9-NEXT: v_mov_b32_e32 v40, v55
+; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB98_4
; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
@@ -209186,12 +209219,36 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(34)
-; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(52)
+; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
@@ -209223,164 +209280,149 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
-; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
-; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
-; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -209397,41 +209439,50 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61
; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39
-; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38
-; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60
-; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59
-; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
-; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -209440,84 +209491,103 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49
; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42
-; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -209527,16 +209597,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -209546,23 +209616,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
@@ -209570,14 +209633,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33
-; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_waitcnt vmcnt(1)
@@ -209586,11 +209653,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -209599,10 +209666,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -209612,11 +209679,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -209625,10 +209692,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -209638,11 +209705,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -209651,10 +209718,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -209664,11 +209731,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -209677,10 +209744,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -209690,53 +209757,34 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59
; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58
; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43
; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -236672,75 +236720,76 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v6
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v7
; SI-NEXT: v_cvt_f16_f32_e32 v40, v8
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
; SI-NEXT: v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v14
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v19
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v25
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v30
; SI-NEXT: v_cvt_f16_f32_e32 v44, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v52, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
; SI-NEXT: v_cvt_f16_f32_e32 v4, v17
; SI-NEXT: v_cvt_f16_f32_e32 v13, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v21
; SI-NEXT: v_cvt_f16_f32_e32 v20, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v26
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v29
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v27
; SI-NEXT: v_cvt_f16_f32_e32 v25, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v21, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v51
; SI-NEXT: v_cvt_f16_f32_e32 v22, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v49
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v47
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v41
; SI-NEXT: v_cvt_f16_f32_e32 v47, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v54, v46
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v46
; SI-NEXT: v_cvt_f16_f32_e32 v42, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v29, v51
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v41
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v59
; SI-NEXT: v_cvt_f16_f32_e32 v17, v45
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v58
; SI-NEXT: v_cvt_f16_f32_e32 v11, v57
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v62
+; SI-NEXT: v_cvt_f16_f32_e32 v54, v59
; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v61
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v62
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f16_f32_e32 v49, v32
; SI-NEXT: v_cvt_f16_f32_e32 v51, v60
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v37
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v33
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100
@@ -236749,18 +236798,17 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v33
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v37
; SI-NEXT: v_cvt_f16_f32_e32 v33, v63
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v45, v39
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v6
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v7
; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v8
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v10
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124
@@ -236781,23 +236829,33 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT: v_mov_b32_e32 v63, v49
+; SI-NEXT: v_mov_b32_e32 v49, v54
+; SI-NEXT: v_mov_b32_e32 v54, v41
+; SI-NEXT: v_mov_b32_e32 v41, v9
+; SI-NEXT: v_mov_b32_e32 v56, v5
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v5, v34
+; SI-NEXT: v_mov_b32_e32 v1, v55
+; SI-NEXT: v_mov_b32_e32 v55, v43
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v59, v29
-; SI-NEXT: v_mov_b32_e32 v29, v27
-; SI-NEXT: v_mov_b32_e32 v57, v23
+; SI-NEXT: v_mov_b32_e32 v29, v26
+; SI-NEXT: v_mov_b32_e32 v57, v15
; SI-NEXT: v_mov_b32_e32 v60, v3
; SI-NEXT: v_mov_b32_e32 v62, v4
-; SI-NEXT: v_mov_b32_e32 v63, v49
-; SI-NEXT: v_mov_b32_e32 v49, v12
+; SI-NEXT: v_mov_b32_e32 v26, v14
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB108_2
; SI-NEXT: ; %bb.1: ; %cmp.true
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v61
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
@@ -236809,7 +236867,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v15
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
@@ -236820,21 +236878,18 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v6, v6, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v8
; SI-NEXT: v_or_b32_e32 v7, v7, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15
-; SI-NEXT: v_or_b32_e32 v14, v14, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58
+; SI-NEXT: v_or_b32_e32 v61, v14, v37
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36
; SI-NEXT: v_or_b32_e32 v33, v33, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v58
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v58, v12
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
@@ -236867,240 +236922,236 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v52
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v34, v34
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v61
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v12
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4
-; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT: v_or_b32_e32 v61, v3, v37
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58
-; SI-NEXT: v_or_b32_e32 v11, v11, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17
-; SI-NEXT: v_or_b32_e32 v16, v16, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; SI-NEXT: v_or_b32_e32 v21, v21, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25
-; SI-NEXT: v_or_b32_e32 v24, v24, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; SI-NEXT: v_or_b32_e32 v31, v31, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20
-; SI-NEXT: v_or_b32_e32 v19, v19, v37
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
-; SI-NEXT: v_or_b32_e32 v18, v18, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v37
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v3, v3, v37
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT: v_or_b32_e32 v2, v2, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v53
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v37
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_or_b32_e32 v52, v37, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v55
-; SI-NEXT: v_cvt_f32_f16_e32 v56, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v40, v37
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT: v_or_b32_e32 v55, v37, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v43
; SI-NEXT: v_cvt_f32_f16_e32 v57, v57
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v44, v37
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT: v_or_b32_e32 v43, v37, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_cvt_f32_f16_e32 v47, v47
; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
; SI-NEXT: v_cvt_f32_f16_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v38, v38
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; SI-NEXT: v_cvt_f32_f16_e32 v49, v49
; SI-NEXT: v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v46, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47
; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
+; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v47, v47
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
+; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v45, v45
-; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; SI-NEXT: v_or_b32_e32 v29, v29, v23
; SI-NEXT: v_or_b32_e32 v38, v38, v47
; SI-NEXT: v_or_b32_e32 v54, v54, v42
+; SI-NEXT: v_or_b32_e32 v49, v49, v51
; SI-NEXT: v_or_b32_e32 v45, v45, v50
-; SI-NEXT: v_or_b32_e32 v41, v41, v30
; SI-NEXT: v_or_b32_e32 v46, v46, v32
+; SI-NEXT: v_alignbit_b32 v50, v61, v50, 16
+; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v4
+; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v12
+; SI-NEXT: v_mov_b32_e32 v12, v3
+; SI-NEXT: v_mov_b32_e32 v3, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v52
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v11, v11, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17
+; SI-NEXT: v_or_b32_e32 v16, v16, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22
+; SI-NEXT: v_or_b32_e32 v21, v21, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25
+; SI-NEXT: v_or_b32_e32 v24, v24, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; SI-NEXT: v_or_b32_e32 v31, v31, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_or_b32_e32 v19, v19, v37
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13
+; SI-NEXT: v_or_b32_e32 v18, v18, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v48
+; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16
; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16
-; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16
-; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16
-; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: v_or_b32_e32 v3, v37, v34
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v37
+; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16
+; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48
+; SI-NEXT: v_or_b32_e32 v2, v2, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v53
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v37
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53
+; SI-NEXT: v_or_b32_e32 v52, v37, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v55
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v40, v37
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40
+; SI-NEXT: v_or_b32_e32 v55, v37, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v44
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v43
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v44, v37
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44
+; SI-NEXT: v_or_b32_e32 v43, v37, v39
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v4
+; SI-NEXT: v_or_b32_e32 v4, v37, v34
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v3, v39, v1
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: v_or_b32_e32 v4, v39, v1
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: v_or_b32_e32 v3, v37, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v4
+; SI-NEXT: v_or_b32_e32 v4, v37, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v56, v62
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v3, v39, v9
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v4, v39, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v41
; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37
; SI-NEXT: v_or_b32_e32 v62, v56, v37
; SI-NEXT: v_cvt_f32_f16_e32 v56, v60
-; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16
-; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v3
+; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_or_b32_e32 v60, v56, v39
; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57
; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
+; SI-NEXT: v_or_b32_e32 v3, v41, v30
+; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16
+; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16
; SI-NEXT: v_or_b32_e32 v57, v56, v26
; SI-NEXT: v_cvt_f32_f16_e32 v56, v59
+; SI-NEXT: v_alignbit_b32 v41, v19, v39, 16
+; SI-NEXT: v_mov_b32_e32 v39, v3
; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16
; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_or_b32_e32 v29, v29, v23
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v27, v3
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: v_or_b32_e32 v59, v56, v27
; SI-NEXT: v_cvt_f32_f16_e32 v56, v63
+; SI-NEXT: v_alignbit_b32 v27, v21, v27, 16
; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56
; SI-NEXT: v_cvt_f16_f32_e32 v56, v56
; SI-NEXT: v_or_b32_e32 v63, v56, v35
+; SI-NEXT: v_alignbit_b32 v56, v18, v37, 16
; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v49, v3
-; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT: v_or_b32_e32 v3, v49, v51
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16
-; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16
-; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: .LBB108_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@@ -237114,7 +237165,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v34, v37
; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0
; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3
; SI-NEXT: v_or_b32_e32 v1, v34, v1
@@ -237127,7 +237178,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0
; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v5
@@ -237140,7 +237191,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -237155,7 +237206,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -237165,11 +237216,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -237191,11 +237240,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -237205,11 +237252,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -237237,23 +237282,23 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
@@ -237278,13 +237323,13 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 41b86c0960b46..7b02ad0765689 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -47208,33 +47208,32 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v56, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v4
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v5
; SI-NEXT: v_cvt_f16_f32_e32 v47, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v8
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v8
; SI-NEXT: v_cvt_f16_f32_e32 v46, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v11
; SI-NEXT: v_cvt_f16_f32_e32 v8, v12
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v45, v14
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v15
; SI-NEXT: v_cvt_f16_f32_e32 v12, v16
; SI-NEXT: v_cvt_f16_f32_e32 v13, v17
; SI-NEXT: v_cvt_f16_f32_e32 v44, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v15
; SI-NEXT: v_cvt_f16_f32_e32 v10, v19
; SI-NEXT: v_cvt_f16_f32_e32 v17, v20
; SI-NEXT: v_cvt_f16_f32_e32 v11, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v24
; SI-NEXT: v_cvt_f16_f32_e32 v9, v25
@@ -47243,44 +47242,44 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v18, v28
; SI-NEXT: v_cvt_f16_f32_e32 v7, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f16_f32_e32 v22, v40
; SI-NEXT: v_cvt_f16_f32_e32 v20, v57
; SI-NEXT: v_cvt_f16_f32_e32 v57, v58
; SI-NEXT: v_cvt_f16_f32_e32 v40, v59
; SI-NEXT: v_cvt_f16_f32_e32 v15, v60
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
; SI-NEXT: v_cvt_f16_f32_e32 v62, v62
; SI-NEXT: v_cvt_f16_f32_e32 v30, v63
; SI-NEXT: v_cvt_f16_f32_e32 v16, v33
; SI-NEXT: v_cvt_f16_f32_e32 v63, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v49
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v49
; SI-NEXT: v_cvt_f16_f32_e32 v29, v50
; SI-NEXT: v_cvt_f16_f32_e32 v35, v53
; SI-NEXT: v_cvt_f16_f32_e32 v33, v55
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v58, v31
; SI-NEXT: v_cvt_f16_f32_e32 v28, v32
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v34
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v34
; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f16_f32_e32 v27, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v37
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v38
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v48
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v39
; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v48
; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f16_f32_e32 v26, v52
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v51
; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v52
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v41
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v54
; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v41
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -47290,47 +47289,34 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_mov_b32_e32 v54, v9
; SI-NEXT: v_mov_b32_e32 v55, v11
; SI-NEXT: v_mov_b32_e32 v41, v13
-; SI-NEXT: v_mov_b32_e32 v48, v5
+; SI-NEXT: v_mov_b32_e32 v48, v4
+; SI-NEXT: v_mov_b32_e32 v4, v3
+; SI-NEXT: v_mov_b32_e32 v3, v43
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.true
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v37, v56
-; SI-NEXT: v_mov_b32_e32 v7, v39
-; SI-NEXT: v_cvt_f32_f16_e32 v39, v47
; SI-NEXT: v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v39
-; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
; SI-NEXT: v_cvt_f32_f16_e32 v11, v33
-; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v25
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_cvt_f32_f16_e32 v13, v35
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v33, v11
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
@@ -47349,6 +47335,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
@@ -47367,41 +47354,50 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v5
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v7
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: v_cvt_f32_f16_e32 v39, v47
+; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT: v_or_b32_e32 v5, v38, v47
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v39
+; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48
+; SI-NEXT: v_or_b32_e32 v9, v38, v47
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v38, v46
; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v9
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
; SI-NEXT: v_or_b32_e32 v48, v39, v46
; SI-NEXT: v_cvt_f32_f16_e32 v39, v45
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39
; SI-NEXT: v_cvt_f32_f16_e32 v39, v41
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v38, v9
; SI-NEXT: v_or_b32_e32 v9, v37, v45
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -47412,35 +47408,37 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_or_b32_e32 v9, v38, v57
-; SI-NEXT: v_cvt_f32_f16_e32 v38, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v38, v14
; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37
; SI-NEXT: v_cvt_f32_f16_e32 v37, v55
; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38
; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT: v_or_b32_e32 v41, v39, v43
-; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, v58
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v25
; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38
; SI-NEXT: v_cvt_f32_f16_e32 v38, v54
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v63
-; SI-NEXT: v_or_b32_e32 v55, v37, v42
-; SI-NEXT: v_cvt_f32_f16_e32 v37, v52
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_or_b32_e32 v55, v37, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v37, v52
; SI-NEXT: v_or_b32_e32 v54, v25, v58
; SI-NEXT: v_cvt_f32_f16_e32 v25, v40
; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v51
-; SI-NEXT: v_or_b32_e32 v52, v37, v40
+; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v37
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_or_b32_e32 v52, v37, v40
; SI-NEXT: v_cvt_f16_f32_e32 v37, v30
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v62
@@ -47451,22 +47449,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v29, v61
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v49
; SI-NEXT: v_or_b32_e32 v62, v25, v59
; SI-NEXT: v_cvt_f32_f16_e32 v25, v28
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v49
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v9
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_or_b32_e32 v61, v29, v28
+; SI-NEXT: v_or_b32_e32 v49, v29, v28
; SI-NEXT: v_cvt_f16_f32_e32 v29, v27
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v60
-; SI-NEXT: v_or_b32_e32 v49, v21, v27
+; SI-NEXT: v_or_b32_e32 v38, v21, v27
; SI-NEXT: v_cvt_f32_f16_e32 v21, v26
; SI-NEXT: v_cvt_f32_f16_e32 v26, v7
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
@@ -47480,6 +47478,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v25, v32
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v34
+; SI-NEXT: v_or_b32_e32 v41, v39, v43
; SI-NEXT: v_or_b32_e32 v39, v29, v26
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f32_f16_e32 v29, v31
@@ -47493,14 +47492,19 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v21, v25
; SI-NEXT: v_cvt_f32_f16_e32 v25, v36
; SI-NEXT: v_cvt_f32_f16_e32 v29, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v61
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v36, v25
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v63, v9
+; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v14
; SI-NEXT: v_or_b32_e32 v53, v7, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36
; SI-NEXT: v_or_b32_e32 v50, v25, v21
@@ -47508,18 +47512,17 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v35, v13, v21
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63
; SI-NEXT: v_or_b32_e32 v16, v16, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61
; SI-NEXT: v_or_b32_e32 v15, v15, v21
; SI-NEXT: v_cvt_f32_f16_e32 v21, v22
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20
; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16
+; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16
+; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16
+; SI-NEXT: v_mov_b32_e32 v60, v37
+; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16
; SI-NEXT: v_or_b32_e32 v22, v21, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18
; SI-NEXT: v_or_b32_e32 v24, v24, v21
@@ -47540,19 +47543,15 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16
; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16
; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16
-; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16
+; SI-NEXT: v_alignbit_b32 v14, v19, v42, 16
; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16
+; SI-NEXT: v_mov_b32_e32 v58, v38
; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16
; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16
; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16
-; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16
-; SI-NEXT: v_mov_b32_e32 v60, v37
-; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -47561,7 +47560,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v37, 0xffff, v7
; SI-NEXT: v_or_b32_e32 v37, v37, v38
; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen
; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
@@ -47585,10 +47584,8 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
@@ -47619,7 +47616,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -47661,7 +47658,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -47678,7 +47675,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
@@ -47690,7 +47687,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 4372f11f8ab4a..dbc0f96ea4da6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -51080,79 +51080,79 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84
; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v55, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v22
; SI-NEXT: v_cvt_f16_f32_e32 v40, v4
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v60, v10
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v6, v13
; SI-NEXT: v_cvt_f16_f32_e32 v37, v15
-; SI-NEXT: v_cvt_f16_f32_e32 v60, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v55, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v5
; SI-NEXT: v_cvt_f16_f32_e32 v48, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v12
; SI-NEXT: v_cvt_f16_f32_e32 v38, v16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v22
; SI-NEXT: v_cvt_f16_f32_e32 v11, v23
; SI-NEXT: v_cvt_f16_f32_e32 v5, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v25
; SI-NEXT: v_cvt_f16_f32_e32 v24, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v27
; SI-NEXT: v_cvt_f16_f32_e32 v27, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v29
; SI-NEXT: v_cvt_f16_f32_e32 v28, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51
; SI-NEXT: v_cvt_f16_f32_e32 v22, v43
; SI-NEXT: v_cvt_f16_f32_e32 v23, v44
-; SI-NEXT: v_cvt_f16_f32_e32 v61, v61
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v61
; SI-NEXT: v_cvt_f16_f32_e32 v44, v62
-; SI-NEXT: v_cvt_f16_f32_e32 v18, v63
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v63
; SI-NEXT: v_cvt_f16_f32_e32 v19, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v61, v36
; SI-NEXT: v_cvt_f16_f32_e32 v43, v39
; SI-NEXT: v_cvt_f16_f32_e32 v15, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v54
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v54
; SI-NEXT: v_cvt_f16_f32_e32 v54, v41
; SI-NEXT: v_cvt_f16_f32_e32 v51, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v45
-; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v45
; SI-NEXT: v_cvt_f16_f32_e32 v13, v46
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f16_f32_e32 v50, v47
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f16_f32_e32 v30, v56
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v57
; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v57
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v58
; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v58
-; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v36, v31
-; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: s_waitcnt vmcnt(8)
; SI-NEXT: v_cvt_f16_f32_e32 v45, v32
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_cvt_f16_f32_e32 v31, v59
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f16_f32_e32 v32, v34
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v35
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v35
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96
; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100
@@ -51177,11 +51177,21 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; SI-NEXT: v_mov_b32_e32 v58, v8
+; SI-NEXT: v_mov_b32_e32 v8, v60
+; SI-NEXT: v_mov_b32_e32 v46, v52
+; SI-NEXT: v_mov_b32_e32 v52, v55
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v47, v21
; SI-NEXT: v_mov_b32_e32 v56, v17
; SI-NEXT: v_mov_b32_e32 v57, v6
-; SI-NEXT: v_mov_b32_e32 v58, v7
-; SI-NEXT: v_mov_b32_e32 v59, v33
+; SI-NEXT: v_mov_b32_e32 v59, v61
+; SI-NEXT: v_mov_b32_e32 v61, v29
+; SI-NEXT: v_mov_b32_e32 v29, v25
+; SI-NEXT: v_mov_b32_e32 v25, v18
+; SI-NEXT: v_mov_b32_e32 v21, v16
+; SI-NEXT: v_mov_b32_e32 v17, v1
; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz .LBB58_2
; SI-NEXT: ; %bb.1: ; %cmp.true
@@ -51191,12 +51201,14 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v14
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
@@ -51206,35 +51218,36 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_or_b32_e32 v3, v3, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_or_b32_e32 v9, v9, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT: v_or_b32_e32 v31, v31, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62
-; SI-NEXT: v_or_b32_e32 v63, v6, v34
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_or_b32_e32 v1, v31, v34
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7
+; SI-NEXT: v_or_b32_e32 v6, v6, v34
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13
+; SI-NEXT: v_or_b32_e32 v31, v12, v34
+; SI-NEXT: v_mov_b32_e32 v12, v6
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v63
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v62
; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v16
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
@@ -51248,13 +51261,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT: v_or_b32_e32 v12, v12, v34
-; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v63
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
@@ -51264,7 +51275,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT: v_or_b32_e32 v18, v18, v34
+; SI-NEXT: v_or_b32_e32 v62, v18, v34
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_or_b32_e32 v22, v22, v34
@@ -51276,8 +51287,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v2, v2, v34
; SI-NEXT: v_cvt_f32_f16_e32 v34, v38
; SI-NEXT: v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v46
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v38, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
@@ -51289,79 +51301,89 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f32_f16_e32 v34, v49
; SI-NEXT: v_cvt_f32_f16_e32 v35, v48
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v49, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v58
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_or_b32_e32 v48, v34, v35
; SI-NEXT: v_cvt_f32_f16_e32 v34, v53
; SI-NEXT: v_cvt_f32_f16_e32 v35, v52
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v53, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53
-; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
; SI-NEXT: v_or_b32_e32 v52, v34, v35
; SI-NEXT: v_cvt_f32_f16_e32 v34, v40
+; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v55
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v40, v34
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40
-; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: v_or_b32_e32 v55, v34, v35
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v34, v60
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34
; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
+; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_or_b32_e32 v6, v35, v34
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v51, v51
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v42, v42
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v41
; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
; SI-NEXT: v_cvt_f32_f16_e32 v54, v54
; SI-NEXT: v_cvt_f32_f16_e32 v50, v50
; SI-NEXT: v_cvt_f32_f16_e32 v36, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v33, v33
; SI-NEXT: v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43
; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51
; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42
; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41
; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v43
; SI-NEXT: v_cvt_f16_f32_e32 v51, v51
; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54
; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50
; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v42
+; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
; SI-NEXT: v_cvt_f16_f32_e32 v41, v41
; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39
; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
@@ -51369,102 +51391,91 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_cvt_f16_f32_e32 v54, v54
; SI-NEXT: v_cvt_f16_f32_e32 v50, v50
; SI-NEXT: v_cvt_f16_f32_e32 v36, v36
+; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
; SI-NEXT: v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43
; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42
; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41
; SI-NEXT: v_or_b32_e32 v25, v25, v24
; SI-NEXT: v_or_b32_e32 v29, v29, v28
; SI-NEXT: v_or_b32_e32 v54, v54, v51
; SI-NEXT: v_or_b32_e32 v50, v50, v30
+; SI-NEXT: v_or_b32_e32 v33, v33, v42
; SI-NEXT: v_or_b32_e32 v39, v39, v41
; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16
; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16
; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16
-; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16
-; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16
+; SI-NEXT: v_alignbit_b32 v30, v12, v30, 16
+; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v35, v6
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_or_b32_e32 v6, v35, v1
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
; SI-NEXT: v_cvt_f32_f16_e32 v46, v57
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_or_b32_e32 v58, v35, v8
; SI-NEXT: v_cvt_f32_f16_e32 v35, v56
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT: v_or_b32_e32 v57, v46, v14
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v46, v47
-; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
; SI-NEXT: v_or_b32_e32 v56, v35, v17
+; SI-NEXT: v_alignbit_b32 v17, v2, v17, 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v6
+; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_or_b32_e32 v57, v46, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v46, v47
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_cvt_f32_f16_e32 v46, v59
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46
; SI-NEXT: v_cvt_f16_f32_e32 v46, v46
-; SI-NEXT: v_or_b32_e32 v59, v46, v43
-; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v6
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v47, v35, v21
; SI-NEXT: v_cvt_f32_f16_e32 v35, v44
; SI-NEXT: v_cvt_f32_f16_e32 v44, v61
+; SI-NEXT: v_or_b32_e32 v59, v46, v43
+; SI-NEXT: v_alignbit_b32 v46, v52, v1, 16
+; SI-NEXT: v_alignbit_b32 v1, v37, v14, 16
+; SI-NEXT: v_mov_b32_e32 v14, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35
; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
+; SI-NEXT: v_alignbit_b32 v21, v11, v21, 16
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16
; SI-NEXT: v_or_b32_e32 v61, v44, v35
; SI-NEXT: v_cvt_f32_f16_e32 v44, v45
+; SI-NEXT: v_alignbit_b32 v51, v14, v51, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44
; SI-NEXT: v_cvt_f16_f32_e32 v44, v44
; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44
; SI-NEXT: v_or_b32_e32 v36, v36, v45
-; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16
+; SI-NEXT: v_alignbit_b32 v44, v62, v35, 16
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v33, v6
-; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT: v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT: v_or_b32_e32 v6, v33, v42
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16
-; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: .LBB58_2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
@@ -51476,7 +51487,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v34, v34, v35
; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0
; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
; SI-NEXT: v_or_b32_e32 v1, v34, v1
@@ -51499,9 +51510,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
@@ -51511,11 +51524,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17
; SI-NEXT: v_or_b32_e32 v1, v1, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
@@ -51525,11 +51536,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -51570,7 +51579,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
@@ -51583,7 +51592,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -51594,7 +51603,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
@@ -51606,8 +51615,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -51624,10 +51633,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
index 4a0bb6ceccd3f..1b5d4a9c1b929 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -42,14 +42,11 @@ body: |
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
- ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
- ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $sgpr20_sgpr21 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr22_sgpr23 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
- ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec
; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240
@@ -58,7 +55,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
@@ -67,7 +64,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72
@@ -95,12 +92,12 @@ body: |
; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57
; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76
- ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
- ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54
- ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50
- ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
- ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68
+ ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58 = COPY renamable $sgpr52_sgpr53_sgpr54
+ ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
+ ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr56_sgpr57_sgpr58
+ ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr76
; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72
; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
@@ -165,23 +162,22 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr22_sgpr23, undef renamable $sgpr54_sgpr55, implicit-def dead $scc
; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6:
; CHECK-NEXT: successors: %bb.7(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.7:
; CHECK-NEXT: successors: %bb.8(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
@@ -189,14 +185,14 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.8:
; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.9:
; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec
@@ -218,11 +214,17 @@ body: |
; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15
; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14
; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19
+ ; CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr20_sgpr21
+ ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr22_sgpr23
+ ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr24_sgpr25
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
+ ; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr38_sgpr39
+ ; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr36_sgpr37
+ ; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr50_sgpr51
; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49
; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85
; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83
@@ -238,44 +240,42 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.10:
; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.12
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.11:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.17
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.12:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.13:
; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
- ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+ ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.14
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.14:
; CHECK-NEXT: successors: %bb.15(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.15:
; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000)
- ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101
+ ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+ ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr20_sgpr21, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.16:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
index 81f72b70d1ecb..24c7480240608 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
@@ -42,11 +42,13 @@ body: |
%24:sgpr_128 = COPY %1
%25:sgpr_128 = COPY %1
%26:sgpr_128 = COPY %1
+ %27:sgpr_128 = COPY %1
S_BRANCH %bb.1
bb.1:
liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
+ %27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
@@ -55,6 +57,7 @@ body: |
bb.2:
liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
+ %27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_CBRANCH_EXECZ %bb.3, implicit $exec
@@ -63,6 +66,7 @@ body: |
bb.3:
liveins: $sgpr102_sgpr103
+ %27 = IMPLICIT_DEF implicit-def $exec
%0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
$exec = S_XOR_B64_term $exec, %0, implicit-def $scc
S_BRANCH %bb.4
@@ -81,6 +85,7 @@ body: |
S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc
S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %26.sub0_sub1, %27.sub2_sub3, implicit-def $scc
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0
...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir
new file mode 100644
index 0000000000000..a44f5b477f052
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir
@@ -0,0 +1,167 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s
+
+---
+# Check that spill save/restore should be inserted after $exec mask is defined.
+
+name: foo
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+body: |
+ ; CHECK-LABEL: name: foo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5)
+ ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr102_sgpr103
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5)
+ ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0
+ bb.0:
+ liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
+
+ %0:sreg_64 = COPY $sgpr102_sgpr103
+ %1:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+ %2:sgpr_128 = COPY %1
+ %3:sgpr_128 = COPY %1
+ %4:sgpr_128 = COPY %1
+ %5:sgpr_128 = COPY %1
+ %6:sgpr_128 = COPY %1
+ %7:sgpr_128 = COPY %1
+ %8:sgpr_128 = COPY %1
+ %9:sgpr_128 = COPY %1
+ %10:sgpr_128 = COPY %1
+ %11:sgpr_128 = COPY %1
+ %12:sgpr_128 = COPY %1
+ %13:sgpr_128 = COPY %1
+ %14:sgpr_128 = COPY %1
+ %15:sgpr_128 = COPY %1
+ %16:sgpr_128 = COPY %1
+ %17:sgpr_128 = COPY %1
+ %18:sgpr_128 = COPY %1
+ %19:sgpr_128 = COPY %1
+ %20:sgpr_128 = COPY %1
+ %21:sgpr_128 = COPY %1
+ %22:sgpr_128 = COPY %1
+ %23:sgpr_128 = COPY %1
+ %24:sgpr_128 = COPY %1
+ %25:sgpr_128 = COPY %1
+ %26:sgpr_128 = COPY %1
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
+
+ %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
+ $exec = S_XOR_B64_term $exec, %0, implicit-def $scc
+ S_CBRANCH_EXECZ %bb.3, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ liveins: $sgpr98_sgpr99, $sgpr102_sgpr103
+
+ %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec
+ $exec = S_XOR_B64_term $exec, %0, implicit-def $scc
+ S_CBRANCH_EXECZ %bb.3, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ liveins: $sgpr102_sgpr103
+
+ %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec
+ $exec = S_XOR_B64_term $exec, %0, implicit-def $scc
+ S_BRANCH %bb.4
+
+ bb.4:
+ $exec = IMPLICIT_DEF
+ S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %7.sub0_sub1, %8.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %9.sub0_sub1, %10.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %11.sub0_sub1, %12.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %13.sub0_sub1, %14.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %15.sub0_sub1, %16.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %17.sub0_sub1, %18.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %19.sub0_sub1, %20.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc
+ S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index b5474b8974b29..1c5f221dd679b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9742,170 +9742,122 @@ entry:
define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: test_limited_sgpr:
; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0
+; GFX6-NEXT: s_mov_b32 s18, 0
+; GFX6-NEXT: v_mov_b32_e32 v6, 0
+; GFX6-NEXT: s_mov_b32 s19, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[16:17], s[14:15]
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240
; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s42, -1
; GFX6-NEXT: s_mov_b32 s43, 0xe8f000
; GFX6-NEXT: s_add_u32 s40, s40, s11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; GFX6-NEXT: s_addc_u32 s41, s41, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0
-; GFX6-NEXT: v_mov_b32_e32 v6, 0
-; GFX6-NEXT: s_mov_b64 s[4:5], exec
-; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b64 exec, 15
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0
-; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_writelane_b32 v1, s0, 0
-; GFX6-NEXT: v_writelane_b32 v1, s1, 1
-; GFX6-NEXT: v_writelane_b32 v1, s2, 2
-; GFX6-NEXT: v_writelane_b32 v1, s3, 3
-; GFX6-NEXT: s_mov_b32 s8, 0x80400
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240
-; GFX6-NEXT: s_mov_b32 s2, 0x86a00
-; GFX6-NEXT: s_mov_b64 s[8:9], exec
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT: s_mov_b32 s2, 0x86600
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT: s_mov_b32 s2, 0x86200
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT: s_mov_b32 s2, 0x85e00
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT: s_mov_b32 s2, 0x85a00
+; GFX6-NEXT: s_mov_b32 s0, 0x85e00
+; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[16:19], 0 addr64 offset:32
+; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[16:19], 0 addr64 offset:48
+; GFX6-NEXT: s_waitcnt vmcnt(2)
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224
+; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT: s_mov_b32 s2, 0x85600
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208
+; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT: s_mov_b32 s2, 0x85200
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:192
+; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT: s_mov_b32 s2, 0x84e00
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176
+; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT: s_mov_b32 s2, 0x84a00
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160
+; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT: s_mov_b32 s2, 0x84600
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144
+; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT: s_mov_b32 s2, 0x84200
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128
+; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64
-; GFX6-NEXT: s_mov_b32 s2, 0x83a00
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16
-; GFX6-NEXT: s_mov_b32 s2, 0x83200
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96
+; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32
-; GFX6-NEXT: s_mov_b32 s2, 0x83600
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80
+; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT: s_mov_b64 exec, 15
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[16:19], 0 addr64 offset:64
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_writelane_b32 v4, s0, 0
-; GFX6-NEXT: v_writelane_b32 v4, s1, 1
-; GFX6-NEXT: v_writelane_b32 v4, s2, 2
-; GFX6-NEXT: v_writelane_b32 v4, s3, 3
-; GFX6-NEXT: s_mov_b32 s10, 0x80800
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[8:9]
-; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT: s_mov_b32 s0, 0x83e00
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[16:19], 0 addr64 offset:16
+; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4
; GFX6-NEXT: s_waitcnt expcnt(3)
; GFX6-NEXT: v_mov_b32_e32 v7, 1
; GFX6-NEXT: s_mov_b64 s[0:1], exec
@@ -9924,22 +9876,76 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s9, 5
; GFX6-NEXT: v_writelane_b32 v4, s10, 6
; GFX6-NEXT: v_writelane_b32 v4, s11, 7
-; GFX6-NEXT: s_mov_b32 s2, 0x80c00
+; GFX6-NEXT: s_mov_b32 s2, 0x80400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ; def s[8:15]
+; GFX6-NEXT: ; def s[4:11]
; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_mov_b64 exec, 0xff
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_writelane_b32 v4, s4, 0
+; GFX6-NEXT: v_writelane_b32 v4, s5, 1
+; GFX6-NEXT: v_writelane_b32 v4, s6, 2
+; GFX6-NEXT: v_writelane_b32 v4, s7, 3
+; GFX6-NEXT: v_writelane_b32 v4, s8, 4
+; GFX6-NEXT: v_writelane_b32 v4, s9, 5
+; GFX6-NEXT: v_writelane_b32 v4, s10, 6
+; GFX6-NEXT: v_writelane_b32 v4, s11, 7
+; GFX6-NEXT: s_mov_b32 s2, 0x80c00
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_mov_b64 exec, s[0:1]
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ; def s[16:23]
+; GFX6-NEXT: ; def s[4:11]
; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_mov_b64 exec, 0xff
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_writelane_b32 v4, s4, 0
+; GFX6-NEXT: v_writelane_b32 v4, s5, 1
+; GFX6-NEXT: v_writelane_b32 v4, s6, 2
+; GFX6-NEXT: v_writelane_b32 v4, s7, 3
+; GFX6-NEXT: v_writelane_b32 v4, s8, 4
+; GFX6-NEXT: v_writelane_b32 v4, s9, 5
+; GFX6-NEXT: v_writelane_b32 v4, s10, 6
+; GFX6-NEXT: v_writelane_b32 v4, s11, 7
+; GFX6-NEXT: s_mov_b32 s2, 0x81400
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ; def s[24:31]
+; GFX6-NEXT: ; def s[0:7]
; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_mov_b64 s[8:9], exec
+; GFX6-NEXT: s_mov_b64 exec, 0xff
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_writelane_b32 v4, s0, 0
+; GFX6-NEXT: v_writelane_b32 v4, s1, 1
+; GFX6-NEXT: v_writelane_b32 v4, s2, 2
+; GFX6-NEXT: v_writelane_b32 v4, s3, 3
+; GFX6-NEXT: v_writelane_b32 v4, s4, 4
+; GFX6-NEXT: v_writelane_b32 v4, s5, 5
+; GFX6-NEXT: v_writelane_b32 v4, s6, 6
+; GFX6-NEXT: v_writelane_b32 v4, s7, 7
+; GFX6-NEXT: s_mov_b32 s10, 0x81c00
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_mov_b64 exec, s[8:9]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; def s[0:3]
; GFX6-NEXT: ;;#ASMEND
@@ -9950,33 +9956,28 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ; def s33
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX6-NEXT: s_mov_b64 vcc, s[6:7]
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1: ; %bb0
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: s_mov_b64 exec, 0xff
+; GFX6-NEXT: s_mov_b64 s[8:9], exec
+; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_writelane_b32 v4, s8, 0
-; GFX6-NEXT: v_writelane_b32 v4, s9, 1
-; GFX6-NEXT: v_writelane_b32 v4, s10, 2
-; GFX6-NEXT: v_writelane_b32 v4, s11, 3
-; GFX6-NEXT: v_writelane_b32 v4, s12, 4
-; GFX6-NEXT: v_writelane_b32 v4, s13, 5
-; GFX6-NEXT: v_writelane_b32 v4, s14, 6
-; GFX6-NEXT: v_writelane_b32 v4, s15, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x81400
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT: v_writelane_b32 v4, s12, 0
+; GFX6-NEXT: v_writelane_b32 v4, s13, 1
+; GFX6-NEXT: v_writelane_b32 v4, s14, 2
+; GFX6-NEXT: v_writelane_b32 v4, s15, 3
+; GFX6-NEXT: s_mov_b32 s10, 0x82400
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: s_mov_b64 exec, s[8:9]
+; GFX6-NEXT: s_mov_b64 s[20:21], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x80c00
+; GFX6-NEXT: s_mov_b32 s22, 0x80400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s22 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s8, v4, 0
; GFX6-NEXT: v_readlane_b32 s9, v4, 1
@@ -9988,31 +9989,27 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s15, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: s_mov_b64 exec, 0xff
+; GFX6-NEXT: s_mov_b64 exec, s[20:21]
+; GFX6-NEXT: s_mov_b64 s[20:21], exec
+; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_writelane_b32 v4, s16, 0
; GFX6-NEXT: v_writelane_b32 v4, s17, 1
; GFX6-NEXT: v_writelane_b32 v4, s18, 2
; GFX6-NEXT: v_writelane_b32 v4, s19, 3
-; GFX6-NEXT: v_writelane_b32 v4, s20, 4
-; GFX6-NEXT: v_writelane_b32 v4, s21, 5
-; GFX6-NEXT: v_writelane_b32 v4, s22, 6
-; GFX6-NEXT: v_writelane_b32 v4, s23, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x81c00
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT: s_mov_b32 s22, 0x82c00
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s22 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: s_mov_b64 exec, s[20:21]
+; GFX6-NEXT: s_mov_b64 s[24:25], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x81400
+; GFX6-NEXT: s_mov_b32 s26, 0x80c00
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s26 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s16, v4, 0
; GFX6-NEXT: v_readlane_b32 s17, v4, 1
@@ -10024,31 +10021,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s23, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: s_mov_b64 exec, 0xff
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_writelane_b32 v4, s24, 0
-; GFX6-NEXT: v_writelane_b32 v4, s25, 1
-; GFX6-NEXT: v_writelane_b32 v4, s26, 2
-; GFX6-NEXT: v_writelane_b32 v4, s27, 3
-; GFX6-NEXT: v_writelane_b32 v4, s28, 4
-; GFX6-NEXT: v_writelane_b32 v4, s29, 5
-; GFX6-NEXT: v_writelane_b32 v4, s30, 6
-; GFX6-NEXT: v_writelane_b32 v4, s31, 7
-; GFX6-NEXT: s_mov_b32 s34, 0x82400
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: s_mov_b64 exec, s[24:25]
+; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s34, 0x81c00
+; GFX6-NEXT: s_mov_b32 s36, 0x81400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readlane_b32 s24, v4, 0
; GFX6-NEXT: v_readlane_b32 s25, v4, 1
@@ -10060,8 +10039,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_readlane_b32 s31, v4, 7
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: s_mov_b64 exec, s[34:35]
+; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -10069,12 +10048,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: v_writelane_b32 v4, s1, 1
; GFX6-NEXT: v_writelane_b32 v4, s2, 2
; GFX6-NEXT: v_writelane_b32 v4, s3, 3
-; GFX6-NEXT: s_mov_b32 s34, 0x82c00
-; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT: s_mov_b32 s36, 0x82800
+; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 3
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
@@ -10087,10 +10066,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
+; GFX6-NEXT: s_mov_b64 vcc, s[6:7]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s36, 0x82400
+; GFX6-NEXT: s_mov_b32 s36, 0x81c00
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10108,7 +10088,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s44, 0x82c00
+; GFX6-NEXT: s_mov_b32 s44, 0x82800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -10134,84 +10114,102 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35]
; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: ;;#ASMSTART
-; GFX6-NEXT: ;;#ASMEND
-; GFX6-NEXT: .LBB1_2: ; %ret
-; GFX6-NEXT: s_or_b64 exec, exec, vcc
-; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_mov_b64 s[6:7], vcc
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s6, 0x80400
+; GFX6-NEXT: s_mov_b32 s2, 0x82c00
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readlane_b32 s0, v4, 0
-; GFX6-NEXT: v_readlane_b32 s1, v4, 1
-; GFX6-NEXT: v_readlane_b32 s2, v4, 2
-; GFX6-NEXT: v_readlane_b32 s3, v4, 3
+; GFX6-NEXT: v_readlane_b32 s16, v4, 0
+; GFX6-NEXT: v_readlane_b32 s17, v4, 1
+; GFX6-NEXT: v_readlane_b32 s18, v4, 2
+; GFX6-NEXT: v_readlane_b32 s19, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1]
-; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_mov_b64 exec, s[0:1]
+; GFX6-NEXT: s_mov_b64 s[0:1], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT: s_mov_b32 s6, 0x80800
+; GFX6-NEXT: s_mov_b32 s2, 0x82400
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readlane_b32 s0, v4, 0
-; GFX6-NEXT: v_readlane_b32 s1, v4, 1
-; GFX6-NEXT: v_readlane_b32 s2, v4, 2
-; GFX6-NEXT: v_readlane_b32 s3, v4, 3
+; GFX6-NEXT: v_readlane_b32 s12, v4, 0
+; GFX6-NEXT: v_readlane_b32 s13, v4, 1
+; GFX6-NEXT: v_readlane_b32 s14, v4, 2
+; GFX6-NEXT: v_readlane_b32 s15, v4, 3
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: s_mov_b64 exec, s[4:5]
-; GFX6-NEXT: s_mov_b32 s0, 0x86a00
-; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3]
+; GFX6-NEXT: s_mov_b64 exec, s[0:1]
+; GFX6-NEXT: s_mov_b32 s0, 0x86200
+; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
; GFX6-NEXT: s_mov_b32 s0, 0x86600
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240
+; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: s_mov_b32 s0, 0x86a00
+; GFX6-NEXT: s_waitcnt expcnt(4)
+; GFX6-NEXT: v_mov_b32_e32 v0, v20
+; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s0 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT: v_mov_b32_e32 v1, v21
+; GFX6-NEXT: v_mov_b32_e32 v2, v22
+; GFX6-NEXT: v_mov_b32_e32 v3, v23
+; GFX6-NEXT: s_waitcnt expcnt(3)
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: s_waitcnt expcnt(2)
+; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: s_waitcnt expcnt(1)
+; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: s_mov_b32 s0, 0x86600
+; GFX6-NEXT: v_mov_b32_e32 v23, v3
+; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x86200
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
-; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: v_mov_b32_e32 v22, v2
+; GFX6-NEXT: v_mov_b32_e32 v21, v1
+; GFX6-NEXT: v_mov_b32_e32 v20, v0
+; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
+; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: ;;#ASMSTART
+; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: .LBB1_2: ; %ret
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_mov_b32 s0, 0x85e00
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208
-; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
+; GFX6-NEXT: s_mov_b64 s[14:15], s[18:19]
; GFX6-NEXT: s_mov_b32 s0, 0x85a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:240
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10219,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:224
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10227,7 +10225,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x85200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:208
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10235,7 +10233,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:192
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10243,7 +10241,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:176
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10251,7 +10249,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:160
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10259,23 +10257,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x84200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:144
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83a00
+; GFX6-NEXT: s_mov_b32 s0, 0x83e00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:128
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT: s_mov_b32 s0, 0x83e00
+; GFX6-NEXT: s_mov_b32 s0, 0x83a00
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:112
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10283,7 +10281,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83600
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:96
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
@@ -10291,15 +10289,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_mov_b32 s0, 0x83200
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32
-; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:80
+; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[12:15], 0 addr64 offset:64
+; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[12:15], 0 addr64 offset:48
+; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[12:15], 0 addr64 offset:32
+; GFX6-NEXT: s_waitcnt expcnt(3)
; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16
-; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64
+; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:16
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[12:15], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX9-FLATSCR-LABEL: test_limited_sgpr:
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 3913e93b83a66..1dddc29deae25 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -31,22 +31,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
; CHECK-NEXT: .cfi_offset %ebp, -8
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: ## %bb.1: ## %bb116.i
-; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: ## %bb.2: ## %bb52.i.i
-; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: ## %bb.3: ## %bb142.i
-; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: je LBB0_7
; CHECK-NEXT: ## %bb.4:
+; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi
; CHECK-NEXT: movb $1, %bh
; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp
; CHECK-NEXT: jmp LBB0_5
-; CHECK-NEXT: LBB0_21: ## %bb7806
+; CHECK-NEXT: LBB0_23: ## %bb7806
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp16: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -57,50 +58,50 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: LBB0_5: ## %bb3261
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmpl $37, 0
-; CHECK-NEXT: jne LBB0_25
-; CHECK-NEXT: ## %bb.6: ## %bb3306
+; CHECK-NEXT: jne LBB0_6
+; CHECK-NEXT: ## %bb.8: ## %bb3306
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp0: ## EH_LABEL
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN12wxStringBaseaSEPKw
; CHECK-NEXT: Ltmp1: ## EH_LABEL
-; CHECK-NEXT: ## %bb.7: ## %bb3314
+; CHECK-NEXT: ## %bb.9: ## %bb3314
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: movl 0, %eax
; CHECK-NEXT: cmpl $121, %eax
-; CHECK-NEXT: ja LBB0_25
-; CHECK-NEXT: ## %bb.8: ## %bb3314
+; CHECK-NEXT: ja LBB0_6
+; CHECK-NEXT: ## %bb.10: ## %bb3314
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4)
-; CHECK-NEXT: LBB0_10: ## %bb5809
+; CHECK-NEXT: LBB0_12: ## %bb5809
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne LBB0_25
-; CHECK-NEXT: ## %bb.11: ## %bb5809
+; CHECK-NEXT: jne LBB0_6
+; CHECK-NEXT: ## %bb.13: ## %bb5809
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb %bh, %bh
-; CHECK-NEXT: je LBB0_25
-; CHECK-NEXT: ## %bb.12: ## %bb91.i8504
+; CHECK-NEXT: je LBB0_6
+; CHECK-NEXT: ## %bb.14: ## %bb91.i8504
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je LBB0_14
-; CHECK-NEXT: ## %bb.13: ## %bb155.i8541
+; CHECK-NEXT: je LBB0_16
+; CHECK-NEXT: ## %bb.15: ## %bb155.i8541
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp4: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll _gmtime_r
; CHECK-NEXT: Ltmp5: ## EH_LABEL
-; CHECK-NEXT: LBB0_14: ## %bb182.i8560
+; CHECK-NEXT: LBB0_16: ## %bb182.i8560
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je LBB0_15
-; CHECK-NEXT: ## %bb.16: ## %bb278.i8617
+; CHECK-NEXT: je LBB0_17
+; CHECK-NEXT: ## %bb.18: ## %bb278.i8617
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: je LBB0_18
-; CHECK-NEXT: ## %bb.17: ## %bb440.i8663
+; CHECK-NEXT: je LBB0_20
+; CHECK-NEXT: ## %bb.19: ## %bb440.i8663
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp6: ## EH_LABEL
; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax
@@ -113,11 +114,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5
; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_
; CHECK-NEXT: Ltmp7: ## EH_LABEL
-; CHECK-NEXT: jmp LBB0_18
-; CHECK-NEXT: LBB0_15: ## %bb187.i8591
+; CHECK-NEXT: jmp LBB0_20
+; CHECK-NEXT: LBB0_17: ## %bb187.i8591
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: jne LBB0_25
-; CHECK-NEXT: LBB0_18: ## %invcont5814
+; CHECK-NEXT: jne LBB0_6
+; CHECK-NEXT: LBB0_20: ## %invcont5814
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp8: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -126,7 +127,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: Ltmp9: ## EH_LABEL
-; CHECK-NEXT: ## %bb.19: ## %invcont5831
+; CHECK-NEXT: ## %bb.21: ## %invcont5831
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: Ltmp10: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -136,7 +137,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN12wxStringBase10ConcatSelfEmPKwm
; CHECK-NEXT: Ltmp11: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_5
-; CHECK-NEXT: LBB0_9: ## %bb5657
+; CHECK-NEXT: LBB0_11: ## %bb5657
; CHECK-NEXT: Ltmp13: ## EH_LABEL
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -144,8 +145,8 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: movl %eax, (%esp)
; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
; CHECK-NEXT: Ltmp14: ## EH_LABEL
-; CHECK-NEXT: jmp LBB0_25
-; CHECK-NEXT: LBB0_20: ## %bb5968
+; CHECK-NEXT: jmp LBB0_6
+; CHECK-NEXT: LBB0_22: ## %bb5968
; CHECK-NEXT: Ltmp2: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -153,23 +154,24 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: Ltmp3: ## EH_LABEL
-; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i
+; CHECK-NEXT: LBB0_6: ## %bb3267
; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: LBB0_7: ## %bb115.critedge.i
; CHECK-NEXT: addl $28, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl $4
-; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp
+; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp
; CHECK-NEXT: Ltmp15: ## EH_LABEL
-; CHECK-NEXT: jmp LBB0_25
-; CHECK-NEXT: LBB0_24: ## %lpad8185
+; CHECK-NEXT: jmp LBB0_6
+; CHECK-NEXT: LBB0_26: ## %lpad8185
; CHECK-NEXT: Ltmp12: ## EH_LABEL
-; CHECK-NEXT: jmp LBB0_25
-; CHECK-NEXT: LBB0_22: ## %lpad.loopexit
+; CHECK-NEXT: jmp LBB0_6
+; CHECK-NEXT: LBB0_24: ## %lpad.loopexit
; CHECK-NEXT: Ltmp18: ## EH_LABEL
-; CHECK-NEXT: jmp LBB0_25
+; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: Lfunc_end0:
entry:
br i1 %foo, label %bb116.i, label %bb115.critedge.i
>From 900e06d80cf8fdb7c074374c656a4d1948b7c245 Mon Sep 17 00:00:00 2001
From: mitchell <mitchell.xu2 at gmail.com>
Date: Sat, 29 Nov 2025 10:36:01 +0800
Subject: [PATCH 5/7] [clang-tidy] Fix OOB access in `FormatStringConverter`
with signed chars (#169215)
`FormatStringConverter::appendFormatText` incorrectly treated non-ASCII
characters (e.g. UTF-8) as negative values when using signed chars. This
caused them to pass the `< 32` check for control characters.
The negative values were passed to `llvm::hexdigit`, resulting in an OOB
access and a crash.
This closes
[#169198](https://github.com/llvm/llvm-project/issues/169198)
---
.../clang-tidy/utils/FormatStringConverter.cpp | 7 ++++---
clang-tools-extra/docs/ReleaseNotes.rst | 7 ++++---
.../test/clang-tidy/check_clang_tidy.py | 2 ++
.../clang-tidy/checkers/modernize/use-std-print.cpp | 12 ++++++++++++
4 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp
index 23dae04916e9b..d210b000dfd33 100644
--- a/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp
+++ b/clang-tools-extra/clang-tidy/utils/FormatStringConverter.cpp
@@ -700,6 +700,7 @@ void FormatStringConverter::finalizeFormatText() {
/// Append literal parts of the format text, reinstating escapes as required.
void FormatStringConverter::appendFormatText(const StringRef Text) {
for (const char Ch : Text) {
+ const auto UCh = static_cast<unsigned char>(Ch);
if (Ch == '\a')
StandardFormatString += "\\a";
else if (Ch == '\b')
@@ -724,10 +725,10 @@ void FormatStringConverter::appendFormatText(const StringRef Text) {
} else if (Ch == '}') {
StandardFormatString += "}}";
FormatStringNeededRewriting = true;
- } else if (Ch < 32) {
+ } else if (UCh < 32) {
StandardFormatString += "\\x";
- StandardFormatString += llvm::hexdigit(Ch >> 4, true);
- StandardFormatString += llvm::hexdigit(Ch & 0xf, true);
+ StandardFormatString += llvm::hexdigit(UCh >> 4, true);
+ StandardFormatString += llvm::hexdigit(UCh & 0xf, true);
} else
StandardFormatString += Ch;
}
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index a6f80e3721db1..644c5cb573cf7 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -69,7 +69,7 @@ Potentially Breaking Changes
- `CharTypdefsToIgnore` to `CharTypedefsToIgnore` in
:doc:`bugprone-signed-char-misuse
<clang-tidy/checks/bugprone/signed-char-misuse>`
-
+
- Modified the custom message format of :doc:`bugprone-unsafe-functions
<clang-tidy/checks/bugprone/unsafe-functions>` by assigning a special meaning
to the character ``>`` at the start of the value of the option
@@ -394,7 +394,7 @@ Changes in existing checks
<clang-tidy/checks/bugprone/unhandled-self-assignment>` check by adding
an additional matcher that generalizes the copy-and-swap idiom pattern
detection.
-
+
- Improved :doc:`bugprone-unsafe-functions
<clang-tidy/checks/bugprone/unsafe-functions>` check by hiding the default
suffix when the reason starts with the character `>` in the `CustomFunctions`
@@ -497,7 +497,8 @@ Changes in existing checks
- Improved :doc:`modernize-use-std-print
<clang-tidy/checks/modernize/use-std-print>` check to correctly match
when the format string is converted to a different type by an implicit
- constructor call.
+ constructor call, and fixed a crash when handling format strings
+ containing non-ASCII characters.
- Improved :doc:`performance-unnecessary-copy-initialization
<clang-tidy/checks/performance/unnecessary-copy-initialization>` by printing
diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
index 183b33f135be8..b173ecf4fbdca 100755
--- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
+++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
@@ -398,6 +398,8 @@ def parse_arguments() -> Tuple[argparse.Namespace, List[str]]:
def main() -> None:
+ sys.stdout.reconfigure(encoding="utf-8")
+ sys.stderr.reconfigure(encoding="utf-8")
args, extra_args = parse_arguments()
abbreviated_stds = args.std
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp
index ec37f077df7fc..63972cc0fd25e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp
@@ -54,6 +54,12 @@ void printf_deceptive_newline() {
// CHECK-FIXES: std::println("Hello");
}
+void printf_utf8_text() {
+ printf("你好世界\n");
+ // CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::println' instead of 'printf' [modernize-use-std-print]
+ // CHECK-FIXES: std::println("你好世界");
+}
+
void printf_crlf_newline() {
printf("Hello\r\n");
// CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::print' instead of 'printf' [modernize-use-std-print]
@@ -303,6 +309,12 @@ void fprintf_simple() {
// CHECK-FIXES: std::print(stderr, "Hello");
}
+void fprintf_utf8_text() {
+ fprintf(stderr, "你好世界\n");
+ // CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::println' instead of 'fprintf' [modernize-use-std-print]
+ // CHECK-FIXES: std::println(stderr, "你好世界");
+}
+
void std_printf_simple() {
std::printf("std::Hello");
// CHECK-MESSAGES: [[@LINE-1]]:3: warning: use 'std::print' instead of 'printf' [modernize-use-std-print]
>From a23bde089ec86b3033a7c4dd3a1734806e5c1642 Mon Sep 17 00:00:00 2001
From: Qihan Cai <caiqihan021 at hotmail.com>
Date: Sat, 29 Nov 2025 19:07:19 +1100
Subject: [PATCH 6/7] [RISCV] Intrinsic Support for XCVelw (#129168)
---
llvm/include/llvm/IR/IntrinsicsRISCVXCV.td | 4 +++
.../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 +++
llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td | 9 ++++++-
llvm/test/CodeGen/RISCV/xcvelw.ll | 27 +++++++++++++++++++
llvm/test/MC/RISCV/corev/XCVelw-pseudo.s | 11 ++++++++
5 files changed, 53 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/RISCV/xcvelw.ll
create mode 100644 llvm/test/MC/RISCV/corev/XCVelw-pseudo.s
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td
index 9f6a9964903ae..465665c838bae 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCVXCV.td
@@ -90,4 +90,8 @@ let TargetPrefix = "riscv" in {
def int_riscv_cv_mac_machhuRN : ScalarCoreVMacGprGprGprImmIntrinsic;
def int_riscv_cv_mac_macsRN : ScalarCoreVMacGprGprGprImmIntrinsic;
def int_riscv_cv_mac_machhsRN : ScalarCoreVMacGprGprGprImmIntrinsic;
+
+ def int_riscv_cv_elw_elw
+ : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+ [IntrReadMem, IntrArgMemOnly, IntrHasSideEffects]>;
} // TargetPrefix = "riscv"
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 75ce1b144a2e7..9bb3724c96c11 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -4082,6 +4082,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
return false;
}
+ case RISCV::PseudoCV_ELW:
+ emitLoadStoreSymbol(Inst, RISCV::CV_ELW, IDLoc, Out, /*HasTmpReg=*/false);
+ return false;
}
emitToStreamer(Out, Inst);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
index aa8f1a1108b6b..7abc616f03141 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td
@@ -633,8 +633,9 @@ let Predicates = [HasVendorXCVmem, IsRV32] in {
def CV_SW_rr : CVStore_rr<0b011, 0b0010110, "cv.sw">;
}
-let Predicates = [HasVendorXCVelw, IsRV32], hasSideEffects = 0,
+let Predicates = [HasVendorXCVelw, IsRV32], hasSideEffects = 1,
mayLoad = 1, mayStore = 0 in {
+ def PseudoCV_ELW : PseudoLoad<"cv.elw">;
// Event load
def CV_ELW : CVLoad_ri<0b011, "cv.elw">;
}
@@ -706,6 +707,12 @@ let Predicates = [HasVendorXCVmem, IsRV32], AddedComplexity = 1 in {
def : CVStrrPat<store, CV_SW_rr>;
}
+let Predicates = [HasVendorXCVelw, IsRV32] in {
+ def : Pat<(int_riscv_cv_elw_elw (XLenVT GPR:$rs1)), (PseudoCV_ELW GPR:$rs1)>;
+ def : Pat<(int_riscv_cv_elw_elw (AddrRegImm (XLenVT GPR:$rs1), simm12_lo:$imm12)),
+ (CV_ELW GPR:$rs1, simm12_lo:$imm12)>;
+}
+
multiclass PatCoreVBitManip<Intrinsic intr> {
def : PatGprGpr<intr, !cast<RVInst>("CV_" # NAME # "R")>;
def : Pat<(intr GPR:$rs1, cv_uimm10:$imm),
diff --git a/llvm/test/CodeGen/RISCV/xcvelw.ll b/llvm/test/CodeGen/RISCV/xcvelw.ll
new file mode 100644
index 0000000000000..4ff8a5b38494f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xcvelw.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=riscv32 -mattr=+xcvelw -verify-machineinstrs < %s \
+; RUN: | FileCheck %s
+
+declare i32 @llvm.riscv.cv.elw.elw(i8*)
+
+define i32 @test.cv.elw.elw(i8* %a) {
+; CHECK-LABEL: test.cv.elw.elw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cv.elw a0, 0(a0)
+; CHECK-NEXT: ret
+ %1 = call i32 @llvm.riscv.cv.elw.elw(i8* %a)
+ ret i32 %1
+}
+
+define i32 @test.cv.elw.elw2(i8* %a, i32 %b) {
+; CHECK-LABEL: test.cv.elw.elw2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: cv.elw a0, 7(a0)
+; CHECK-NEXT: ret
+ %c = add i32 %b, 4
+ %d = add i32 %c, 3
+ %e = getelementptr i8, i8* %a, i32 %d
+ %1 = call i32 @llvm.riscv.cv.elw.elw(i8* %e)
+ ret i32 %1
+}
\ No newline at end of file
diff --git a/llvm/test/MC/RISCV/corev/XCVelw-pseudo.s b/llvm/test/MC/RISCV/corev/XCVelw-pseudo.s
new file mode 100644
index 0000000000000..172ebfde9f338
--- /dev/null
+++ b/llvm/test/MC/RISCV/corev/XCVelw-pseudo.s
@@ -0,0 +1,11 @@
+# RUN: llvm-mc %s -triple=riscv32 --mattr=+xcvelw | FileCheck %s
+
+# CHECK: .Lpcrel_hi0:
+# CHECK: auipc a2, %pcrel_hi(a_symbol)
+# CHECK: cv.elw a2, %pcrel_lo(.Lpcrel_hi0)(a2)
+cv.elw a2, a_symbol
+
+# CHECK: .Lpcrel_hi1:
+# CHECK: auipc a3, %pcrel_hi(a_symbol)
+# CHECK: cv.elw a3, %pcrel_lo(.Lpcrel_hi1)(a3)
+cv.elw a3, a_symbol
>From fe22d30c77f9eafadee6469dc6758336bc6bea1c Mon Sep 17 00:00:00 2001
From: wermos <63574588+wermos at users.noreply.github.com>
Date: Sat, 29 Nov 2025 01:55:58 +0530
Subject: [PATCH 7/7] Pre-commit tests
---
llvm/test/Transforms/InstCombine/icmp-add.ll | 39 ++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 8449c7c5ea935..943830a9f42e9 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -3440,3 +3440,42 @@ define i1 @val_is_aligend_pred_mismatch(i32 %num) {
%_0 = icmp sge i32 %num.masked, %num
ret i1 %_0
}
+
+define i1 @icmp_samesign_with_nsw_add(i32 %arg0) {
+; CHECK-LABEL: @icmp_samesign_with_nsw_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG0:%.*]], -26
+; CHECK-NEXT: [[V1:%.*]] = icmp ult i32 [[TMP0]], -8
+; CHECK-NEXT: ret i1 [[V1]]
+;
+entry:
+ %v0 = add nsw i32 %arg0, -18
+ %v1 = icmp samesign ugt i32 %v0, 7
+ ret i1 %v1
+}
+
+; Shouldn't fire since -124 - 12 causes signed overflow
+define i1 @icmp_samesign_with_nsw_add_no_fire(i8 %arg0) {
+; CHECK-LABEL: @icmp_samesign_with_nsw_add_no_fire(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[ARG0:%.*]], -121
+; CHECK-NEXT: [[V1:%.*]] = icmp ult i8 [[TMP0]], 123
+; CHECK-NEXT: ret i1 [[V1]]
+;
+entry:
+ %v0 = add nsw i8 %arg0, 12
+ %v1 = icmp samesign ugt i8 %v0, -124
+ ret i1 %v1
+}
+
+define i1 @icmp_with_nuw_add(i32 %arg0) {
+; CHECK-LABEL: @icmp_with_nuw_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[V1:%.*]] = icmp ugt i32 [[ARG0:%.*]], 11
+; CHECK-NEXT: ret i1 [[V1]]
+;
+entry:
+ %v0 = add nuw i32 %arg0, 7
+ %v1 = icmp ugt i32 %v0, 18
+ ret i1 %v1
+}
More information about the llvm-commits
mailing list