[llvm] Revert "[LSV] Merge contiguous chains across scalar types" (PR #170381)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 14:24:53 PST 2025
https://github.com/dakersnar created https://github.com/llvm/llvm-project/pull/170381
Reverts llvm/llvm-project#154069. I pointed out a number of issues post-merge, most importantly, examples of miscompiles: https://github.com/llvm/llvm-project/pull/154069#issuecomment-3603854626.
While the motivation for the change is clear, I think the implementation approach is flawed. The goal seems to be to allow elements like `load <2xi16>` and `load i32` to be vectorized together despite the current algorithm not grouping them into the same equivalence class. I personally think that if we want to attempt this, it should be a more holistic approach, maybe even redefining the concept of an equivalence class. The current solution seems like it would be really hard to get bug-free, and even if the bugs were fixed, it is only able to merge chains that happen to be adjacent to each other after `splitChainByContiguity`. But we can discuss more in the re-land; maybe the broader approach I'm proposing is too difficult and a narrow optimization is worthwhile. Regardless, this should be reverted; it needs more iteration before it is correct.
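For context, a minimal hand-written IR sketch of the pattern the reverted change was trying to catch (a made-up example, not taken from the PR): two contiguous, same-size accesses whose scalar types differ. Because the equivalence-class key includes the scalar element type's size (`DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType())`, 16 vs. 32 bits here), the current algorithm puts them in separate classes and never considers them for the same chain.

```llvm
; Two adjacent 4-byte loads from the same underlying object. They key into
; different equivalence classes (i16 vs. i32 scalar type), so they are never
; chained together, even though together they cover a contiguous 8 bytes.
define i32 @adjacent_mixed_types(ptr addrspace(1) %p) {
  %v = load <2 x i16>, ptr addrspace(1) %p, align 8
  %q = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
  %s = load i32, ptr addrspace(1) %q, align 4
  %e = extractelement <2 x i16> %v, i64 0
  %z = zext i16 %e to i32
  %r = add i32 %z, %s
  ret i32 %r
}
```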
>From 8e57b16bfa9642bdc07b8ef4142604a3d1ab87ef Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dakersnar at me.com>
Date: Tue, 2 Dec 2025 16:14:02 -0600
Subject: [PATCH] Revert "[LSV] Merge contiguous chains across scalar types
(#154069)"
This reverts commit fbdf8ab59005bc35f23b3167e0783013c7ee5fa4.
---
llvm/include/llvm/Transforms/Utils/Local.h | 2 +-
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 2 +-
.../InstCombineLoadStoreAlloca.cpp | 4 +-
llvm/lib/Transforms/Scalar/SROA.cpp | 2 +-
llvm/lib/Transforms/Utils/Local.cpp | 60 +-
.../Vectorize/LoadStoreVectorizer.cpp | 282 +--
.../AMDGPU/GlobalISel/amdgpu-irtranslator.ll | 2 -
.../AMDGPU/agpr-copy-no-free-registers.ll | 262 ++-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 1617 ++++++++---------
llvm/test/CodeGen/AMDGPU/build_vector.ll | 31 +-
llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 58 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 28 +-
llvm/test/CodeGen/AMDGPU/fabs.ll | 31 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 54 +-
llvm/test/CodeGen/AMDGPU/fdiv.ll | 333 ++--
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 23 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 84 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 31 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 31 +-
llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 106 +-
llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 69 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 290 +--
llvm/test/CodeGen/AMDGPU/fshr.ll | 403 ++--
llvm/test/CodeGen/AMDGPU/global_atomics.ll | 82 -
llvm/test/CodeGen/AMDGPU/half.ll | 174 +-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 54 +-
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 12 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 24 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 312 ++--
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 117 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 117 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 67 +-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 113 +-
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 113 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 26 +-
llvm/test/CodeGen/AMDGPU/max.ll | 34 +-
llvm/test/CodeGen/AMDGPU/min.ll | 216 +--
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 88 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 74 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 58 +-
llvm/test/CodeGen/AMDGPU/s_addk_i32.ll | 2 +-
llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 127 +-
llvm/test/CodeGen/AMDGPU/store-to-constant.ll | 6 +-
llvm/test/CodeGen/AMDGPU/udivrem.ll | 126 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 2 +-
.../InstCombine/copy-access-metadata.ll | 215 ---
.../AMDGPU/copy-metadata-load-store.ll | 159 --
.../AMDGPU/merge-vectors-complex.ll | 324 +---
.../AMDGPU/merge-vectors.ll | 284 +--
50 files changed, 2766 insertions(+), 3979 deletions(-)
delete mode 100644 llvm/test/Transforms/InstCombine/copy-access-metadata.ll
delete mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index d0af2d3d2e4c2..9acfd872e574b 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -431,7 +431,7 @@ LLVM_ABI void combineAAMetadata(Instruction *K, const Instruction *J);
/// Copy the metadata from the source instruction to the destination (the
/// replacement for the source instruction).
-LLVM_ABI void copyMetadataForAccess(Instruction &Dest, Instruction &Source);
+LLVM_ABI void copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source);
/// Patch the replacement so that it is not more restrictive than the value
/// being replaced. It assumes that the replacement does not get moved from
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 98884c441096e..fdff21b6ef8df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1035,7 +1035,7 @@ bool LegalizeBufferContentTypesVisitor::visitLoadImpl(
LoadInst *NewLI = IRB.CreateAlignedLoad(
LoadableType, NewPtr, commonAlignment(OrigLI.getAlign(), ByteOffset),
Name + ".off." + Twine(ByteOffset));
- copyMetadataForAccess(*NewLI, OrigLI);
+ copyMetadataForLoad(*NewLI, OrigLI);
NewLI->setAAMetadata(
AANodes.adjustForAccess(ByteOffset, LoadableType, DL));
NewLI->setAtomic(OrigLI.getOrdering(), OrigLI.getSyncScopeID());
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 3e04aeb675d2a..9491610190c10 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -415,7 +415,7 @@ void PointerReplacer::replace(Instruction *I) {
LT->getAlign(), LT->getOrdering(),
LT->getSyncScopeID());
NewI->takeName(LT);
- copyMetadataForAccess(*NewI, *LT);
+ copyMetadataForLoad(*NewI, *LT);
IC.InsertNewInstWith(NewI, LT->getIterator());
IC.replaceInstUsesWith(*LT, NewI);
@@ -606,7 +606,7 @@ LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
Builder.CreateAlignedLoad(NewTy, LI.getPointerOperand(), LI.getAlign(),
LI.isVolatile(), LI.getName() + Suffix);
NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- copyMetadataForAccess(*NewLoad, LI);
+ copyMetadataForLoad(*NewLoad, LI);
return NewLoad;
}
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a7c322bfcb981..70afe833c9f47 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3272,7 +3272,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
// Copy any metadata that is valid for the new load. This may require
// conversion to a different kind of metadata, e.g. !nonnull might change
// to !range or vice versa.
- copyMetadataForAccess(*NewLI, LI);
+ copyMetadataForLoad(*NewLI, LI);
// Do this after copyMetadataForLoad() to preserve the TBAA shift.
if (AATags)
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index dec2e019333b9..a03cf6e953e35 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3100,70 +3100,54 @@ void llvm::combineAAMetadata(Instruction *K, const Instruction *J) {
combineMetadata(K, J, /*DoesKMove=*/true, /*AAOnly=*/true);
}
-void llvm::copyMetadataForAccess(Instruction &DestI, Instruction &SourceI) {
+void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- SourceI.getAllMetadata(MD);
- MDBuilder MDB(DestI.getContext());
- Type *NewType = DestI.getType();
-
- // Only needed for range metadata on loads.
- const DataLayout *DL = nullptr;
- const LoadInst *LSource = dyn_cast<LoadInst>(&SourceI);
- if (LSource)
- DL = &LSource->getDataLayout();
-
+ Source.getAllMetadata(MD);
+ MDBuilder MDB(Dest.getContext());
+ Type *NewType = Dest.getType();
+ const DataLayout &DL = Source.getDataLayout();
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
MDNode *N = MDPair.second;
-
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a load instruction changing *only its type*.
+ // The only metadata it makes sense to drop is metadata which is invalidated
+ // when the pointer type changes. This should essentially never be the case
+ // in LLVM, but we explicitly switch over only known metadata to be
+ // conservatively correct. If you are adding metadata to LLVM which pertains
+ // to loads, you almost certainly want to add it here.
switch (ID) {
- // Applies to both loads and stores as-is.
case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_invariant_load:
case LLVMContext::MD_alias_scope:
case LLVMContext::MD_noalias:
case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
case LLVMContext::MD_access_group:
case LLVMContext::MD_noundef:
case LLVMContext::MD_noalias_addrspace:
- case LLVMContext::MD_mem_parallel_loop_access:
- DestI.setMetadata(ID, N);
- break;
-
- // Load-only metadata.
- case LLVMContext::MD_fpmath:
- case LLVMContext::MD_invariant_load:
- if (isa<LoadInst>(DestI))
- DestI.setMetadata(ID, N);
+ // All of these directly apply.
+ Dest.setMetadata(ID, N);
break;
case LLVMContext::MD_nonnull:
- if (auto *LDest = dyn_cast<LoadInst>(&DestI)) {
- if (LSource)
- copyNonnullMetadata(*LSource, N, *LDest);
- }
+ copyNonnullMetadata(Source, N, Dest);
break;
case LLVMContext::MD_align:
case LLVMContext::MD_dereferenceable:
case LLVMContext::MD_dereferenceable_or_null:
- // Applies to both loads and stores only if the new type is also a
- // pointer.
+ // These only directly apply if the new type is also a pointer.
if (NewType->isPointerTy())
- DestI.setMetadata(ID, N);
+ Dest.setMetadata(ID, N);
break;
case LLVMContext::MD_range:
- if (auto *LDest = dyn_cast<LoadInst>(&DestI)) {
- if (LSource && DL)
- copyRangeMetadata(*DL, *LSource, N, *LDest);
- }
- break;
-
- case LLVMContext::MD_tbaa:
- if (isa<LoadInst>(DestI))
- DestI.setMetadata(ID, N);
+ copyRangeMetadata(DL, Source, N, Dest);
break;
}
}
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 114df653bad83..c28314f6ab124 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -112,7 +112,6 @@
#include <optional>
#include <tuple>
#include <type_traits>
-#include <unordered_map>
#include <utility>
#include <vector>
@@ -269,6 +268,11 @@ class Vectorizer {
/// isGuaranteedToTransferExecutionToSuccessor(I) == true.
bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
+ /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
+ /// in the same BB with the same value for getUnderlyingObject() etc.
+ bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass);
+
/// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
/// where all instructions access a known, constant offset from the first
/// instruction.
@@ -334,22 +338,12 @@ class Vectorizer {
EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End);
- /// Inserts a cast instruction to convert Inst to DstTy.
- Value *insertCast(Value *Val, Type *DstTy);
-
/// Partitions Instrs into "chains" where every instruction has a known
/// constant offset from the first instr in the chain.
///
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
-
- // Helpers for chain merging.
- std::optional<APInt> computeLeaderDelta(Instruction *I1, Instruction *I2);
- bool chainsOverlapAfterRebase(const Chain &A, const Chain &B,
- const APInt &Delta) const;
- static void rebaseChain(Chain &C, const APInt &Delta);
- void normalizeChainToType(Chain &C, Type *CastTy);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -431,20 +425,6 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
return Changed ? PA : PreservedAnalyses::all();
}
-static const Value *getUnderlyingObject(const Value *Ptr) {
- const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The select's themselves are distinct instructions even if they share
- // the same condition and evaluate to consecutive pointers for true and
- // false values of the condition. Therefore using the select's themselves
- // for grouping instructions would put consecutive accesses into different
- // lists and they won't be even checked for being consecutive, and won't
- // be vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
-}
-
bool Vectorizer::run() {
bool Changed = false;
// Break up the BB if there are any instrs which aren't guaranteed to transfer
@@ -488,88 +468,6 @@ bool Vectorizer::run() {
return Changed;
}
-Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
- if (DL.getTypeSizeInBits(Val->getType()) == DL.getTypeSizeInBits(DstTy)) {
- return Builder.CreateBitOrPointerCast(Val, DstTy, Val->getName() + ".bc");
- }
-
- // If the types are of different sizes and both are integers, we can use
- // zext or sext to cast.
- if (Val->getType()->isIntegerTy() && DstTy->isIntegerTy()) {
- if (DL.getTypeSizeInBits(Val->getType()) < DL.getTypeSizeInBits(DstTy)) {
- return Builder.CreateZExt(Val, DstTy, Val->getName() + ".bc");
- }
- return Builder.CreateTrunc(Val, DstTy, Val->getName() + ".bc");
- }
-
- return nullptr;
-}
-
-std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
- Instruction *I2) {
- assert(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
- (isa<StoreInst>(I1) && isa<StoreInst>(I2))) &&
- "computeLeaderDelta must be called with two load or two store "
- "instructions");
- Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
- const Value *Ptr1 = getLoadStorePointerOperand(I1);
- const Value *Ptr2 = getLoadStorePointerOperand(I2);
- return getConstantOffset(const_cast<Value *>(Ptr1), const_cast<Value *>(Ptr2),
- CtxInst);
-}
-
-bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
- const APInt &Delta) const {
- ConstantRange ARange(
- A.front().OffsetFromLeader,
- A.back().OffsetFromLeader +
- DL.getTypeStoreSize(getLoadStoreType(A.back().Inst)));
- ConstantRange BRange(
- B.front().OffsetFromLeader + Delta,
- B.back().OffsetFromLeader + Delta +
- DL.getTypeStoreSize(getLoadStoreType(B.back().Inst)));
- return !ARange.intersectWith(BRange).isEmptySet();
-}
-
-void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
- for (ChainElem &E : C)
- E.OffsetFromLeader += Delta;
-}
-
-void Vectorizer::normalizeChainToType(Chain &C, Type *CastTy) {
- for (ChainElem &Elem : C) {
- Instruction *Inst = Elem.Inst;
- Type *OrigValTy = getLoadStoreType(Inst);
- if (OrigValTy == CastTy)
- continue;
-
- if (auto *LI = dyn_cast<LoadInst>(Inst)) {
- Builder.SetInsertPoint(LI);
- LoadInst *NewLoad = Builder.CreateLoad(CastTy, LI->getPointerOperand(),
- LI->getName() + ".mut");
- copyMetadataForAccess(*NewLoad, *LI);
- Value *CastBack = insertCast(NewLoad, OrigValTy);
- if (!CastBack)
- llvm_unreachable("Failed to insert cast");
- LI->replaceAllUsesWith(CastBack);
- ToErase.emplace_back(LI);
- Elem.Inst = NewLoad;
- } else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
- Builder.SetInsertPoint(SI);
- Value *CastVal = insertCast(SI->getValueOperand(), CastTy);
- if (!CastVal)
- llvm_unreachable("Failed to insert cast");
- StoreInst *NewStore =
- Builder.CreateStore(CastVal, SI->getPointerOperand());
- NewStore->setAlignment(SI->getAlign());
- NewStore->setVolatile(SI->isVolatile());
- copyMetadataForAccess(*NewStore, *SI);
- ToErase.emplace_back(SI);
- Elem.Inst = NewStore;
- }
- }
-}
-
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
LLVM_DEBUG({
@@ -582,120 +480,49 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
});
bool Changed = false;
- SmallVector<Chain> ContiguousSubChains;
-
for (const auto &[EqClassKey, EqClass] :
- collectEquivalenceClasses(Begin, End)) {
-
- LLVM_DEBUG({
- dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
- << " keyed on " << EqClassKey << ":\n";
- for (Instruction *I : EqClass)
- dbgs() << " " << *I << "\n";
- });
-
- for (Chain &C : gatherChains(EqClass)) {
-
- // Split up the chain into increasingly smaller chains, until we can
- // finally vectorize the chains.
- //
- // (Don't be scared by the depth of the loop nest here. These operations
- // are all at worst O(n lg n) in the number of instructions, and splitting
- // chains doesn't change the number of instrs. So the whole loop nest is
- // O(n lg n).)
- for (auto &C : splitChainByMayAliasInstrs(C)) {
- for (auto &C : splitChainByContiguity(C)) {
- ContiguousSubChains.emplace_back(C);
- }
- }
- }
- }
-
- // Merge chains in reverse order, so that the first chain is the largest.
- for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
- Chain &C1 = ContiguousSubChains[I - 1];
- Chain &C2 = ContiguousSubChains[I];
+ collectEquivalenceClasses(Begin, End))
+ Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
- // If the scalar types of the chains are the same, we can merge them
- // without inserting any casts.
- if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
- getLoadStoreType(C2[0].Inst)->getScalarType())
- continue;
-
- const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
- const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
- unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
- unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
- bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
- bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
-
- // If the chains are mapped to different types, have distinct underlying
- // pointer objects, or include both loads and stores, skip.
- if (C1IsLoad != C2IsLoad || AS1 != AS2 ||
- ::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr))
- continue;
-
- // Compute constant offset between chain leaders; if unknown, skip.
- std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
- if (!DeltaOpt)
- continue;
-
- // Check that rebasing C2 into C1's coordinate space will not overlap C1.
- if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
- continue;
-
- // Determine the common integer cast type for normalization and ensure total
- // bitwidth matches across all elements of both chains.
- Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
- unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
- auto AllElemsMatchTotalBits = [&](const Chain &C) {
- return llvm::all_of(C, [&](const ChainElem &E) {
- return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
- });
- };
- if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
- continue;
+ return Changed;
+}
- // Power-of-two span ensures we can form a legal, single vector access
- // without padding or splitting. Many targets and cost models assume POT
- // widths, and it guarantees an integral element count for the chosen
- // VecElemTy.
- APInt Sz = C2.front().OffsetFromLeader +
- DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) -
- C1.back().OffsetFromLeader + *DeltaOpt;
- if (!Sz.isPowerOf2())
- continue;
+bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
+ ArrayRef<Instruction *> EqClass) {
+ bool Changed = false;
- // Rebase C2's offsets into C1's coordinate space prior to merging and
- // merge C2 into C1 by appending all elements of C2 to C1, then erase C2
- // from ContiguousSubChains.
- rebaseChain(C2, *DeltaOpt);
- C1.insert(C1.end(), C2.begin(), C2.end());
- ContiguousSubChains.erase(ContiguousSubChains.begin() + I);
-
- // Normalize the value operand/result type of each instruction in C1 to
- // C1CastTy.
- Type *C1CastTy =
- Type::getIntNTy(C1ElemTy->getContext(), DL.getTypeSizeInBits(C1ElemTy));
- normalizeChainToType(C1, C1CastTy);
- }
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+ << " keyed on " << EqClassKey << ":\n";
+ for (Instruction *I : EqClass)
+ dbgs() << " " << *I << "\n";
+ });
- for (auto &C : ContiguousSubChains) {
- if (C.size() <= 1)
- continue;
- for (auto &AlignedSubChain : splitChainByAlignment(C))
- Changed |= vectorizeChain(AlignedSubChain);
- }
+ std::vector<Chain> Chains = gatherChains(EqClass);
+ LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
+ << " nontrivial chains.\n";);
+ for (Chain &C : Chains)
+ Changed |= runOnChain(C);
+ return Changed;
+}
- // Erase all instructions scheduled for deletion in this pseudo-BB.
- for (Instruction *I : ToErase) {
- auto *PtrOperand = getLoadStorePointerOperand(I);
- if (I->use_empty())
- I->eraseFromParent();
- RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
- }
- ToErase.clear();
+bool Vectorizer::runOnChain(Chain &C) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
+ dumpChain(C);
+ });
+ // Split up the chain into increasingly smaller chains, until we can finally
+ // vectorize the chains.
+ //
+ // (Don't be scared by the depth of the loop nest here. These operations are
+ // all at worst O(n lg n) in the number of instructions, and splitting chains
+ // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
+ bool Changed = false;
+ for (auto &C : splitChainByMayAliasInstrs(C))
+ for (auto &C : splitChainByContiguity(C))
+ for (auto &C : splitChainByAlignment(C))
+ Changed |= vectorizeChain(C);
return Changed;
}
@@ -756,7 +583,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
LLVM_DEBUG(
dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
<< *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
- if (!NewChain.empty()) {
+ if (NewChain.size() > 1) {
LLVM_DEBUG({
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
dumpChain(NewChain);
@@ -768,7 +595,7 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
NewChain = SmallVector<ChainElem, 1>({*ChainIt});
}
}
- if (!NewChain.empty()) {
+ if (NewChain.size() > 1) {
LLVM_DEBUG({
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
dumpChain(NewChain);
@@ -833,6 +660,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
}
+ // Filter out length-1 chains, these are uninteresting.
+ llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
return Ret;
}
@@ -852,7 +681,7 @@ Type *Vectorizer::getChainElemTy(const Chain &C) {
if (any_of(C, [](const ChainElem &E) {
return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
})) {
- return IntegerType::getIntNTy(
+ return Type::getIntNTy(
F.getContext(),
DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
}
@@ -1640,6 +1469,20 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
EquivalenceClassMap Ret;
+ auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
+ const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+ // The select's themselves are distinct instructions even if they share
+ // the same condition and evaluate to consecutive pointers for true and
+ // false values of the condition. Therefore using the select's themselves
+ // for grouping instructions would put consecutive accesses into different
+ // lists and they won't be even checked for being consecutive, and won't
+ // be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+ };
+
for (Instruction &I : make_range(Begin, End)) {
auto *LI = dyn_cast<LoadInst>(&I);
auto *SI = dyn_cast<StoreInst>(&I);
@@ -1687,7 +1530,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- Ret[{::getUnderlyingObject(Ptr), AS,
+ Ret[{GetUnderlyingObject(Ptr), AS,
DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
/*IsLoad=*/LI != nullptr}]
.emplace_back(&I);
@@ -1782,7 +1625,8 @@ std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
Ret.reserve(Chains.size());
// Iterate over MRU rather than Chains so the order is deterministic.
for (auto &E : MRU)
- Ret.emplace_back(std::move(E.second));
+ if (E.second.size() > 1)
+ Ret.emplace_back(std::move(E.second));
return Ret;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 7dd907e3c143f..fc236147f1238 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -20,5 +20,3 @@ define void @addi32(i32 %arg1, i32 %arg2) {
store i32 %res, ptr addrspace(1) poison
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index a58766270561b..ebbeab94066d6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -510,55 +510,53 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
+; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
+; GFX908-NEXT: s_mov_b32 s12, 0
+; GFX908-NEXT: s_mov_b32 s9, s12
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
-; GFX908-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
-; GFX908-NEXT: s_load_dword s5, s[8:9], 0x18
-; GFX908-NEXT: s_mov_b32 s4, 0
-; GFX908-NEXT: s_mov_b32 s9, s4
-; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GFX908-NEXT: s_sub_i32 s8, 0, s1
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s5
+; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX908-NEXT: s_sub_i32 s1, 0, s7
+; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0
; GFX908-NEXT: v_mov_b32_e32 v17, 0
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX908-NEXT: v_readfirstlane_b32 s10, v0
-; GFX908-NEXT: s_mul_i32 s8, s8, s10
-; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8
-; GFX908-NEXT: s_add_i32 s10, s10, s8
-; GFX908-NEXT: s_mul_hi_u32 s8, s0, s10
-; GFX908-NEXT: s_mul_i32 s10, s8, s1
-; GFX908-NEXT: s_sub_i32 s0, s0, s10
-; GFX908-NEXT: s_add_i32 s11, s8, 1
-; GFX908-NEXT: s_sub_i32 s10, s0, s1
-; GFX908-NEXT: s_cmp_ge_u32 s0, s1
-; GFX908-NEXT: s_cselect_b32 s8, s11, s8
-; GFX908-NEXT: s_cselect_b32 s0, s10, s0
-; GFX908-NEXT: s_add_i32 s10, s8, 1
-; GFX908-NEXT: s_cmp_ge_u32 s0, s1
-; GFX908-NEXT: s_cselect_b32 s8, s10, s8
-; GFX908-NEXT: s_lshr_b32 s5, s5, 16
-; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s5
-; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
-; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
-; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
+; GFX908-NEXT: v_readfirstlane_b32 s2, v0
+; GFX908-NEXT: s_mul_i32 s1, s1, s2
+; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1
+; GFX908-NEXT: s_add_i32 s2, s2, s1
+; GFX908-NEXT: s_mul_hi_u32 s1, s6, s2
+; GFX908-NEXT: s_mul_i32 s2, s1, s7
+; GFX908-NEXT: s_sub_i32 s2, s6, s2
+; GFX908-NEXT: s_add_i32 s3, s1, 1
+; GFX908-NEXT: s_sub_i32 s6, s2, s7
+; GFX908-NEXT: s_cmp_ge_u32 s2, s7
+; GFX908-NEXT: s_cselect_b32 s1, s3, s1
+; GFX908-NEXT: s_cselect_b32 s2, s6, s2
+; GFX908-NEXT: s_add_i32 s3, s1, 1
+; GFX908-NEXT: s_cmp_ge_u32 s2, s7
+; GFX908-NEXT: s_cselect_b32 s8, s3, s1
+; GFX908-NEXT: s_lshr_b32 s2, s0, 16
+; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2
+; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
+; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
-; GFX908-NEXT: s_or_b32 s12, s12, 28
+; GFX908-NEXT: s_or_b32 s14, s14, 28
+; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_readfirstlane_b32 s5, v16
-; GFX908-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX908-NEXT: s_mul_i32 s3, s3, s5
-; GFX908-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX908-NEXT: s_mul_i32 s2, s2, s5
-; GFX908-NEXT: s_add_i32 s3, s9, s3
-; GFX908-NEXT: s_lshl_b64 s[16:17], s[2:3], 5
+; GFX908-NEXT: v_readfirstlane_b32 s2, v16
+; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX908-NEXT: s_mul_i32 s3, s5, s2
+; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2
+; GFX908-NEXT: s_mul_i32 s2, s4, s2
+; GFX908-NEXT: s_add_i32 s3, s5, s3
+; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5
; GFX908-NEXT: s_branch .LBB3_2
-; GFX908-NEXT: .LBB3_1: ; %Flow21
+; GFX908-NEXT: .LBB3_1: ; %Flow20
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
@@ -571,47 +569,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; GFX908-NEXT: s_mov_b32 s5, s4
+; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX908-NEXT: s_mov_b32 s13, s12
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s12
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
-; GFX908-NEXT: v_mov_b32_e32 v7, s5
-; GFX908-NEXT: v_mov_b32_e32 v9, s5
-; GFX908-NEXT: v_mov_b32_e32 v5, s5
-; GFX908-NEXT: v_mov_b32_e32 v6, s4
-; GFX908-NEXT: v_mov_b32_e32 v8, s4
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s12
+; GFX908-NEXT: v_mov_b32_e32 v8, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s13
+; GFX908-NEXT: v_mov_b32_e32 v7, s13
+; GFX908-NEXT: v_mov_b32_e32 v9, s13
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
; GFX908-NEXT: v_mov_b32_e32 v11, v5
-; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13]
+; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
; GFX908-NEXT: v_mov_b32_e32 v10, v4
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_readfirstlane_b32 s5, v2
-; GFX908-NEXT: v_readfirstlane_b32 s9, v3
-; GFX908-NEXT: s_add_u32 s5, s5, 1
-; GFX908-NEXT: s_addc_u32 s9, s9, 0
-; GFX908-NEXT: s_mul_hi_u32 s22, s10, s5
-; GFX908-NEXT: s_mul_i32 s9, s10, s9
-; GFX908-NEXT: s_mul_i32 s23, s11, s5
-; GFX908-NEXT: s_add_i32 s9, s22, s9
-; GFX908-NEXT: s_mul_i32 s5, s10, s5
-; GFX908-NEXT: s_add_i32 s9, s9, s23
+; GFX908-NEXT: v_readfirstlane_b32 s9, v2
+; GFX908-NEXT: v_readfirstlane_b32 s13, v3
+; GFX908-NEXT: s_add_u32 s9, s9, 1
+; GFX908-NEXT: s_addc_u32 s13, s13, 0
+; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9
+; GFX908-NEXT: s_mul_i32 s13, s6, s13
+; GFX908-NEXT: s_mul_i32 s23, s7, s9
+; GFX908-NEXT: s_add_i32 s13, s22, s13
+; GFX908-NEXT: s_mul_i32 s9, s6, s9
+; GFX908-NEXT: s_add_i32 s13, s13, s23
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT: s_add_u32 s20, s20, s16
+; GFX908-NEXT: s_add_u32 s20, s20, s4
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
-; GFX908-NEXT: s_addc_u32 s21, s21, s17
+; GFX908-NEXT: s_addc_u32 s21, s21, s5
; GFX908-NEXT: s_mov_b64 s[22:23], 0
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: s_add_u32 s22, s20, s5
-; GFX908-NEXT: s_addc_u32 s23, s21, s9
+; GFX908-NEXT: s_add_u32 s22, s20, s9
+; GFX908-NEXT: s_addc_u32 s23, s21, s13
; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc
@@ -659,17 +657,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1
-; GFX908-NEXT: .LBB3_10: ; %Flow20
+; GFX908-NEXT: .LBB3_10: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_mov_b64 s[2:3], -1
; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_add_u32 s6, s6, s8
-; GFX908-NEXT: s_addc_u32 s7, s7, 0
-; GFX908-NEXT: s_add_u32 s12, s12, s14
-; GFX908-NEXT: s_addc_u32 s13, s13, s15
+; GFX908-NEXT: s_add_u32 s10, s10, s8
+; GFX908-NEXT: s_addc_u32 s11, s11, 0
+; GFX908-NEXT: s_add_u32 s14, s14, s16
+; GFX908-NEXT: s_addc_u32 s15, s15, s17
; GFX908-NEXT: s_mov_b64 s[2:3], 0
; GFX908-NEXT: s_branch .LBB3_1
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
@@ -678,54 +676,52 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
+; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
+; GFX90A-NEXT: s_mov_b32 s12, 0
+; GFX90A-NEXT: s_mov_b32 s9, s12
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
-; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
-; GFX90A-NEXT: s_load_dword s5, s[8:9], 0x18
-; GFX90A-NEXT: s_mov_b32 s4, 0
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GFX90A-NEXT: s_sub_i32 s8, 0, s1
+; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT: s_sub_i32 s1, 0, s7
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s5
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v1
-; GFX90A-NEXT: s_mul_i32 s8, s8, s10
-; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8
-; GFX90A-NEXT: s_add_i32 s10, s10, s8
-; GFX90A-NEXT: s_mul_hi_u32 s8, s0, s10
-; GFX90A-NEXT: s_mul_i32 s10, s8, s1
-; GFX90A-NEXT: s_sub_i32 s0, s0, s10
-; GFX90A-NEXT: s_add_i32 s11, s8, 1
-; GFX90A-NEXT: s_sub_i32 s10, s0, s1
-; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
-; GFX90A-NEXT: s_cselect_b32 s8, s11, s8
-; GFX90A-NEXT: s_cselect_b32 s0, s10, s0
-; GFX90A-NEXT: s_add_i32 s10, s8, 1
-; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
-; GFX90A-NEXT: s_cselect_b32 s8, s10, s8
-; GFX90A-NEXT: s_lshr_b32 s5, s5, 16
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s5
-; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
-; GFX90A-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
-; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: s_mul_i32 s1, s1, s2
+; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1
+; GFX90A-NEXT: s_add_i32 s2, s2, s1
+; GFX90A-NEXT: s_mul_hi_u32 s1, s6, s2
+; GFX90A-NEXT: s_mul_i32 s2, s1, s7
+; GFX90A-NEXT: s_sub_i32 s2, s6, s2
+; GFX90A-NEXT: s_add_i32 s3, s1, 1
+; GFX90A-NEXT: s_sub_i32 s6, s2, s7
+; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
+; GFX90A-NEXT: s_cselect_b32 s1, s3, s1
+; GFX90A-NEXT: s_cselect_b32 s2, s6, s2
+; GFX90A-NEXT: s_add_i32 s3, s1, 1
+; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
+; GFX90A-NEXT: s_cselect_b32 s8, s3, s1
+; GFX90A-NEXT: s_lshr_b32 s2, s0, 16
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
+; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
-; GFX90A-NEXT: s_or_b32 s12, s12, 28
+; GFX90A-NEXT: s_or_b32 s14, s14, 28
+; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s5, v18
-; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX90A-NEXT: s_mul_i32 s3, s3, s5
-; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX90A-NEXT: s_mul_i32 s2, s2, s5
-; GFX90A-NEXT: s_add_i32 s3, s9, s3
-; GFX90A-NEXT: s_lshl_b64 s[16:17], s[2:3], 5
+; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
+; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX90A-NEXT: s_mul_i32 s3, s5, s2
+; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2
+; GFX90A-NEXT: s_mul_i32 s2, s4, s2
+; GFX90A-NEXT: s_add_i32 s3, s5, s3
+; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5
; GFX90A-NEXT: s_branch .LBB3_2
-; GFX90A-NEXT: .LBB3_1: ; %Flow21
+; GFX90A-NEXT: .LBB3_1: ; %Flow20
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
@@ -738,34 +734,34 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
-; GFX90A-NEXT: s_mov_b32 s5, s4
+; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX90A-NEXT: s_mov_b32 s13, s12
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[6:7], 0
-; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s5, v4
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v5
-; GFX90A-NEXT: s_add_u32 s5, s5, 1
-; GFX90A-NEXT: s_addc_u32 s9, s9, 0
-; GFX90A-NEXT: s_mul_hi_u32 s22, s10, s5
-; GFX90A-NEXT: s_mul_i32 s9, s10, s9
-; GFX90A-NEXT: s_mul_i32 s23, s11, s5
-; GFX90A-NEXT: s_add_i32 s9, s22, s9
-; GFX90A-NEXT: s_mul_i32 s5, s10, s5
-; GFX90A-NEXT: s_add_i32 s9, s9, s23
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v4
+; GFX90A-NEXT: v_readfirstlane_b32 s13, v5
+; GFX90A-NEXT: s_add_u32 s9, s9, 1
+; GFX90A-NEXT: s_addc_u32 s13, s13, 0
+; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9
+; GFX90A-NEXT: s_mul_i32 s13, s6, s13
+; GFX90A-NEXT: s_mul_i32 s23, s7, s9
+; GFX90A-NEXT: s_add_i32 s13, s22, s13
+; GFX90A-NEXT: s_mul_i32 s9, s6, s9
+; GFX90A-NEXT: s_add_i32 s13, s13, s23
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: s_add_u32 s20, s20, s16
-; GFX90A-NEXT: s_addc_u32 s21, s21, s17
+; GFX90A-NEXT: s_add_u32 s20, s20, s4
+; GFX90A-NEXT: s_addc_u32 s21, s21, s5
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
@@ -773,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: s_add_u32 s22, s20, s5
-; GFX90A-NEXT: s_addc_u32 s23, s21, s9
+; GFX90A-NEXT: s_add_u32 s22, s20, s9
+; GFX90A-NEXT: s_addc_u32 s23, s21, s13
; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
@@ -815,17 +811,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1
-; GFX90A-NEXT: .LBB3_10: ; %Flow20
+; GFX90A-NEXT: .LBB3_10: ; %Flow19
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_mov_b64 s[2:3], -1
; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_add_u32 s6, s6, s8
-; GFX90A-NEXT: s_addc_u32 s7, s7, 0
-; GFX90A-NEXT: s_add_u32 s12, s12, s14
-; GFX90A-NEXT: s_addc_u32 s13, s13, s15
+; GFX90A-NEXT: s_add_u32 s10, s10, s8
+; GFX90A-NEXT: s_addc_u32 s11, s11, 0
+; GFX90A-NEXT: s_add_u32 s14, s14, s16
+; GFX90A-NEXT: s_addc_u32 s15, s15, s17
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_branch .LBB3_1
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 08ce28c12118b..df77e7de43bf6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2543,45 +2543,44 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX6-LABEL: udiv_v4i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s0, s8, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT: s_mov_b32 s0, s4
-; GFX6-NEXT: s_and_b32 s1, s6, 0xffff
+; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX6-NEXT: s_lshr_b32 s5, s10, 16
+; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX6-NEXT: s_lshr_b32 s4, s8, 16
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4
-; GFX6-NEXT: s_lshr_b32 s4, s6, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
-; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT: s_and_b32 s4, s11, 0xffff
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: s_lshr_b32 s4, s9, 16
+; GFX6-NEXT: s_lshr_b32 s4, s11, 16
; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5
; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT: s_lshr_b32 s4, s7, 16
+; GFX6-NEXT: s_lshr_b32 s4, s9, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
@@ -2597,7 +2596,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: s_mov_b32 s1, s5
; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2605,43 +2603,42 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX9-LABEL: udiv_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: s_and_b32 s5, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_and_b32 s7, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_and_b32 s6, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s3, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: s_and_b32 s2, s3, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s1, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT: s_lshr_b32 s2, s7, 16
+; GFX9-NEXT: s_lshr_b32 s0, s3, 16
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT: s_lshr_b32 s0, s1, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -2649,6 +2646,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
@@ -2657,7 +2655,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -2759,51 +2758,49 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX6-LABEL: urem_v4i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s0, s8, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT: s_mov_b32 s0, s4
-; GFX6-NEXT: s_and_b32 s1, s6, 0xffff
+; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX6-NEXT: s_lshr_b32 s5, s10, 16
+; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX6-NEXT: s_lshr_b32 s4, s8, 16
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4
-; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_lshr_b32 s5, s6, 16
-; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
-; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3
-; GFX6-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4
-; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4
+; GFX6-NEXT: s_and_b32 s6, s11, 0xffff
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5
+; GFX6-NEXT: s_and_b32 s5, s9, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT: s_lshr_b32 s4, s9, 16
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v1
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1
+; GFX6-NEXT: s_lshr_b32 s4, s11, 16
; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT: s_lshr_b32 s5, s7, 16
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
+; GFX6-NEXT: s_lshr_b32 s5, s9, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5
-; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT: v_trunc_f32_e32 v1, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
@@ -2814,10 +2811,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11
; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -2829,67 +2826,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: s_and_b32 s5, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT: s_and_b32 s8, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
+; GFX9-NEXT: s_and_b32 s4, s3, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
-; GFX9-NEXT: s_and_b32 s5, s7, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s8, s3, 0xffff
+; GFX9-NEXT: s_and_b32 s5, s1, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
-; GFX9-NEXT: s_lshr_b32 s6, s7, 16
+; GFX9-NEXT: s_lshr_b32 s2, s3, 16
; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6
-; GFX9-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6
-; GFX9-NEXT: v_sub_u32_e32 v4, s2, v1
-; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
+; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, s3, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -2999,64 +2996,62 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX6-LABEL: sdiv_v4i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_sext_i32_i16 s7, s10
-; GFX6-NEXT: s_sext_i32_i16 s6, s4
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7
-; GFX6-NEXT: s_xor_b32 s6, s7, s6
-; GFX6-NEXT: s_ashr_i32 s6, s6, 30
+; GFX6-NEXT: s_sext_i32_i16 s4, s10
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT: s_sext_i32_i16 s5, s8
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: s_mov_b32 s0, s8
-; GFX6-NEXT: s_or_b32 s8, s6, 1
-; GFX6-NEXT: s_mov_b32 s1, s9
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT: s_cselect_b32 s6, s8, 0
-; GFX6-NEXT: s_ashr_i32 s4, s4, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT: s_ashr_i32 s6, s10, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_ashr_i32 s5, s10, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT: s_ashr_i32 s4, s8, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT: s_xor_b32 s4, s6, s4
+; GFX6-NEXT: s_xor_b32 s4, s4, s5
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
-; GFX6-NEXT: s_or_b32 s4, s4, 1
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX6-NEXT: v_trunc_f32_e32 v3, v3
; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT: s_sext_i32_i16 s6, s5
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GFX6-NEXT: s_cselect_b32 s4, s4, 0
+; GFX6-NEXT: s_sext_i32_i16 s5, s11
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3
-; GFX6-NEXT: s_sext_i32_i16 s4, s11
+; GFX6-NEXT: s_sext_i32_i16 s4, s9
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0
-; GFX6-NEXT: s_xor_b32 s4, s4, s6
+; GFX6-NEXT: s_xor_b32 s4, s4, s5
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
-; GFX6-NEXT: s_or_b32 s4, s4, 1
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4
; GFX6-NEXT: v_trunc_f32_e32 v4, v4
; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT: s_cselect_b32 s4, s4, 0
-; GFX6-NEXT: s_ashr_i32 s5, s5, 16
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_ashr_i32 s5, s11, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT: s_ashr_i32 s4, s11, 16
+; GFX6-NEXT: s_ashr_i32 s4, s9, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0
; GFX6-NEXT: s_xor_b32 s4, s4, s5
@@ -3081,13 +3076,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX9-LABEL: sdiv_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s4, s6
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
+; GFX9-NEXT: s_sext_i32_i16 s5, s0
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5
; GFX9-NEXT: s_xor_b32 s4, s5, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
@@ -3099,61 +3094,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: s_cselect_b32 s4, s8, 0
-; GFX9-NEXT: s_ashr_i32 s5, s6, 16
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_sext_i32_i16 s2, s3
; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1
-; GFX9-NEXT: s_or_b32 s2, s2, 1
+; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT: v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT: s_or_b32 s0, s0, 1
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: s_sext_i32_i16 s4, s7
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT: s_cselect_b32 s2, s2, 0
-; GFX9-NEXT: v_add_u32_e32 v4, s2, v4
-; GFX9-NEXT: s_sext_i32_i16 s2, s3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0
+; GFX9-NEXT: v_add_u32_e32 v4, s0, v4
+; GFX9-NEXT: s_sext_i32_i16 s0, s1
+; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s2, s2, 1
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s0, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT: s_cselect_b32 s2, s2, 0
-; GFX9-NEXT: s_ashr_i32 s4, s7, 16
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT: v_add_u32_e32 v1, s2, v5
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0
; GFX9-NEXT: s_ashr_i32 s2, s3, 16
-; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v5
+; GFX9-NEXT: s_ashr_i32 s0, s1, 16
+; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s4, s2, 1
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s2, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
; GFX9-NEXT: v_trunc_f32_e32 v6, v6
; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5
; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s4, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v6
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v6
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
%r = sdiv <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -3271,55 +3266,53 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX6-LABEL: srem_v4i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_sext_i32_i16 s0, s8
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX6-NEXT: s_sext_i32_i16 s1, s6
-; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1
-; GFX6-NEXT: s_xor_b32 s0, s1, s0
+; GFX6-NEXT: s_sext_i32_i16 s4, s10
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT: s_sext_i32_i16 s5, s8
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: s_ashr_i32 s0, s0, 30
-; GFX6-NEXT: s_or_b32 s10, s0, 1
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GFX6-NEXT: s_cselect_b32 s0, s10, 0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; GFX6-NEXT: s_mov_b32 s0, s4
-; GFX6-NEXT: s_ashr_i32 s4, s8, 16
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
+; GFX6-NEXT: s_ashr_i32 s4, s10, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_ashr_i32 s5, s6, 16
+; GFX6-NEXT: s_ashr_i32 s5, s8, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8
; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_lshr_b32 s6, s8, 16
; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX6-NEXT: v_trunc_f32_e32 v3, v3
; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
-; GFX6-NEXT: s_lshr_b32 s10, s6, 16
-; GFX6-NEXT: s_lshr_b32 s8, s8, 16
-; GFX6-NEXT: s_or_b32 s6, s4, 1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT: s_lshr_b32 s7, s10, 16
+; GFX6-NEXT: s_or_b32 s8, s4, 1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_cselect_b32 s4, s8, 0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT: s_sext_i32_i16 s4, s9
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8
+; GFX6-NEXT: s_sext_i32_i16 s4, s11
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT: s_sext_i32_i16 s5, s7
+; GFX6-NEXT: s_sext_i32_i16 s5, s9
; GFX6-NEXT: s_xor_b32 s4, s5, s4
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
@@ -3333,30 +3326,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: s_cselect_b32 s4, s6, 0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4
-; GFX6-NEXT: s_ashr_i32 s4, s9, 16
+; GFX6-NEXT: s_ashr_i32 s4, s11, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT: s_ashr_i32 s5, s7, 16
+; GFX6-NEXT: s_ashr_i32 s5, s9, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5
; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
-; GFX6-NEXT: s_lshr_b32 s6, s7, 16
+; GFX6-NEXT: s_lshr_b32 s6, s9, 16
+; GFX6-NEXT: s_lshr_b32 s7, s11, 16
; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v5, v5
; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4
; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT: s_lshr_b32 s8, s9, 16
-; GFX6-NEXT: s_or_b32 s9, s4, 1
+; GFX6-NEXT: s_or_b32 s8, s4, 1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT: s_cselect_b32 s4, s9, 0
+; GFX6-NEXT: s_cselect_b32 s4, s8, 0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
@@ -3365,13 +3358,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
;
; GFX9-LABEL: srem_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s8, s6
+; GFX9-NEXT: s_sext_i32_i16 s8, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT: s_sext_i32_i16 s9, s2
+; GFX9-NEXT: s_sext_i32_i16 s9, s0
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
; GFX9-NEXT: s_xor_b32 s4, s9, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
@@ -3383,69 +3376,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: s_cselect_b32 s4, s10, 0
-; GFX9-NEXT: s_ashr_i32 s10, s2, 16
-; GFX9-NEXT: s_ashr_i32 s2, s6, 16
+; GFX9-NEXT: s_ashr_i32 s10, s0, 16
+; GFX9-NEXT: s_ashr_i32 s0, s2, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX9-NEXT: s_xor_b32 s2, s10, s0
+; GFX9-NEXT: s_ashr_i32 s2, s2, 30
; GFX9-NEXT: v_add_u32_e32 v1, s4, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT: s_xor_b32 s4, s10, s2
-; GFX9-NEXT: s_ashr_i32 s4, s4, 30
-; GFX9-NEXT: s_or_b32 s6, s4, 1
+; GFX9-NEXT: s_or_b32 s2, s2, 1
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
+; GFX9-NEXT: s_sext_i32_i16 s8, s1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
+; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s4, s6, 0
-; GFX9-NEXT: s_sext_i32_i16 s6, s7
-; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s6
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
-; GFX9-NEXT: s_sext_i32_i16 s8, s3
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v4
+; GFX9-NEXT: s_cselect_b32 s2, s2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v4
+; GFX9-NEXT: s_sext_i32_i16 s2, s3
+; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_xor_b32 s0, s8, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: s_xor_b32 s2, s8, s6
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s0, s0, 1
+; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4
-; GFX9-NEXT: s_or_b32 s2, s2, 1
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s2, s2, 0
-; GFX9-NEXT: s_ashr_i32 s4, s7, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s4
-; GFX9-NEXT: s_ashr_i32 s5, s3, 16
-; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0
-; GFX9-NEXT: v_add_u32_e32 v3, s2, v5
-; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0
+; GFX9-NEXT: s_ashr_i32 s3, s3, 16
+; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3
+; GFX9-NEXT: v_add_u32_e32 v3, s0, v5
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
+; GFX9-NEXT: s_ashr_i32 s2, s1, 16
+; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT: s_xor_b32 s2, s5, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT: s_xor_b32 s0, s2, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s4, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
; GFX9-NEXT: v_trunc_f32_e32 v6, v6
; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5
; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v4, s2, v6
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s4, 0
+; GFX9-NEXT: v_add_u32_e32 v4, s0, v6
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3
; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1
; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4
+; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
%r = srem <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -3841,48 +3834,46 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX6-LABEL: udiv_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s0, s8, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT: s_mov_b32 s0, s4
-; GFX6-NEXT: s_and_b32 s1, s6, 0xffff
+; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX6-NEXT: s_lshr_b32 s5, s10, 16
+; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX6-NEXT: s_lshr_b32 s4, s8, 16
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4
-; GFX6-NEXT: s_lshr_b32 s4, s6, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
-; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4
+; GFX6-NEXT: s_and_b32 s4, s11, 0xffff
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
+; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
-; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -3890,47 +3881,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX9-LABEL: udiv_v3i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: s_and_b32 s5, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_and_b32 s7, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_and_b32 s6, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s3, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s2, s3, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s1, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v6, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4
+; GFX9-NEXT: global_store_dword v6, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%r = udiv <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4010,54 +4002,52 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX6-LABEL: urem_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshr_b32 s6, s10, 16
-; GFX6-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GFX6-NEXT: s_and_b32 s1, s10, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX6-NEXT: s_lshr_b32 s7, s4, 16
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6
-; GFX6-NEXT: s_mov_b32 s0, s8
-; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
-; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v2
+; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX6-NEXT: s_lshr_b32 s5, s10, 16
+; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX6-NEXT: s_lshr_b32 s4, s8, 16
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT: v_trunc_f32_e32 v3, v3
+; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v4
-; GFX6-NEXT: s_mov_b32 s1, s9
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
-; GFX6-NEXT: s_and_b32 s4, s5, 0xffff
-; GFX6-NEXT: v_mad_f32 v2, -v1, v4, v3
-; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4
-; GFX6-NEXT: s_and_b32 s4, s11, 0xffff
-; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v3
+; GFX6-NEXT: s_and_b32 s6, s11, 0xffff
+; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
+; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
+; GFX6-NEXT: s_and_b32 s6, s9, 0xffff
+; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v2
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_mad_f32 v2, -v2, v3, v5
-; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1
+; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5
+; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -4065,34 +4055,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: s_and_b32 s5, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4
-; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: s_and_b32 s9, s2, 0xffff
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_and_b32 s8, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
-; GFX9-NEXT: s_and_b32 s5, s7, 0xffff
+; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5
-; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4101,17 +4090,18 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%r = urem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4197,47 +4187,46 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX6-LABEL: sdiv_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_sext_i32_i16 s7, s10
-; GFX6-NEXT: s_sext_i32_i16 s6, s4
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7
-; GFX6-NEXT: s_xor_b32 s6, s7, s6
-; GFX6-NEXT: s_ashr_i32 s6, s6, 30
+; GFX6-NEXT: s_sext_i32_i16 s4, s10
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT: s_sext_i32_i16 s5, s8
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: s_mov_b32 s0, s8
-; GFX6-NEXT: s_or_b32 s8, s6, 1
-; GFX6-NEXT: s_sext_i32_i16 s5, s5
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT: s_cselect_b32 s6, s8, 0
-; GFX6-NEXT: s_ashr_i32 s4, s4, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v2
-; GFX6-NEXT: s_ashr_i32 s6, s10, 16
-; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_ashr_i32 s5, s10, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2
+; GFX6-NEXT: s_ashr_i32 s4, s8, 16
+; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX6-NEXT: s_xor_b32 s4, s6, s4
+; GFX6-NEXT: s_xor_b32 s4, s4, s5
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
-; GFX6-NEXT: s_or_b32 s4, s4, 1
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX6-NEXT: v_trunc_f32_e32 v3, v3
; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
+; GFX6-NEXT: s_sext_i32_i16 s5, s11
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
-; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX6-NEXT: s_cselect_b32 s4, s4, 0
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3
-; GFX6-NEXT: s_sext_i32_i16 s4, s11
+; GFX6-NEXT: s_sext_i32_i16 s4, s9
; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX6-NEXT: s_xor_b32 s4, s4, s5
@@ -4250,7 +4239,6 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX6-NEXT: s_cselect_b32 s4, s6, 0
-; GFX6-NEXT: s_mov_b32 s1, s9
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -4261,13 +4249,13 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX9-LABEL: sdiv_v3i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s4, s6
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
+; GFX9-NEXT: s_sext_i32_i16 s5, s0
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5
; GFX9-NEXT: s_xor_b32 s4, s5, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
@@ -4279,44 +4267,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: s_cselect_b32 s4, s8, 0
-; GFX9-NEXT: s_ashr_i32 s5, s6, 16
-; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
; GFX9-NEXT: s_ashr_i32 s2, s2, 16
+; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
; GFX9-NEXT: v_add_u32_e32 v2, s4, v3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s2, s2, 1
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_sext_i32_i16 s2, s3
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
+; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT: s_or_b32 s0, s0, 1
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: s_sext_i32_i16 s4, s7
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT: s_cselect_b32 s2, s2, 0
-; GFX9-NEXT: v_add_u32_e32 v3, s2, v4
-; GFX9-NEXT: s_sext_i32_i16 s2, s3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0
+; GFX9-NEXT: v_add_u32_e32 v3, s0, v4
+; GFX9-NEXT: s_sext_i32_i16 s0, s1
+; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s4, s2, 1
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s2, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4
; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s4, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v5
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v5
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4
+; GFX9-NEXT: global_store_dword v1, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%r = sdiv <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4408,70 +4396,68 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX6-LABEL: srem_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_sext_i32_i16 s0, s8
-; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX6-NEXT: s_sext_i32_i16 s1, s6
-; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1
-; GFX6-NEXT: s_xor_b32 s0, s1, s0
+; GFX6-NEXT: s_sext_i32_i16 s4, s10
+; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT: s_sext_i32_i16 s5, s8
+; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT: s_ashr_i32 s0, s0, 30
-; GFX6-NEXT: s_or_b32 s10, s0, 1
+; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_or_b32 s6, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GFX6-NEXT: s_cselect_b32 s0, s10, 0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; GFX6-NEXT: s_mov_b32 s0, s4
-; GFX6-NEXT: s_ashr_i32 s4, s8, 16
+; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
+; GFX6-NEXT: s_ashr_i32 s4, s10, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
-; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_ashr_i32 s5, s6, 16
+; GFX6-NEXT: s_ashr_i32 s5, s8, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8
; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
+; GFX6-NEXT: s_lshr_b32 s6, s8, 16
; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX6-NEXT: v_trunc_f32_e32 v3, v3
; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
-; GFX6-NEXT: s_lshr_b32 s10, s6, 16
-; GFX6-NEXT: s_lshr_b32 s8, s8, 16
-; GFX6-NEXT: s_or_b32 s6, s4, 1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT: s_lshr_b32 s7, s10, 16
+; GFX6-NEXT: s_or_b32 s8, s4, 1
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_cselect_b32 s4, s8, 0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3
-; GFX6-NEXT: s_sext_i32_i16 s4, s9
+; GFX6-NEXT: s_sext_i32_i16 s4, s11
; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT: s_sext_i32_i16 s5, s7
+; GFX6-NEXT: s_sext_i32_i16 s5, s9
; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5
; GFX6-NEXT: s_xor_b32 s4, s5, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX6-NEXT: s_ashr_i32 s4, s4, 30
-; GFX6-NEXT: s_or_b32 s6, s4, 1
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
+; GFX6-NEXT: s_or_b32 s7, s4, 1
; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX6-NEXT: v_trunc_f32_e32 v4, v4
; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3
; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT: s_cselect_b32 s4, s6, 0
+; GFX6-NEXT: s_cselect_b32 s4, s7, 0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1
+; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
@@ -4480,12 +4466,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
;
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s8, s6
+; GFX9-NEXT: s_sext_i32_i16 s8, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
-; GFX9-NEXT: s_sext_i32_i16 s9, s2
+; GFX9-NEXT: s_sext_i32_i16 s9, s0
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
; GFX9-NEXT: s_xor_b32 s4, s9, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
@@ -4497,51 +4483,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX9-NEXT: s_cselect_b32 s4, s10, 0
-; GFX9-NEXT: s_ashr_i32 s10, s2, 16
-; GFX9-NEXT: s_ashr_i32 s2, s6, 16
+; GFX9-NEXT: s_ashr_i32 s10, s0, 16
+; GFX9-NEXT: s_ashr_i32 s0, s2, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX9-NEXT: s_xor_b32 s2, s10, s0
+; GFX9-NEXT: s_ashr_i32 s2, s2, 30
; GFX9-NEXT: v_add_u32_e32 v1, s4, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT: s_xor_b32 s4, s10, s2
-; GFX9-NEXT: s_ashr_i32 s4, s4, 30
-; GFX9-NEXT: s_or_b32 s6, s4, 1
+; GFX9-NEXT: s_or_b32 s2, s2, 1
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s4, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v3
-; GFX9-NEXT: s_sext_i32_i16 s4, s7
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s3
-; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT: s_cselect_b32 s2, s2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX9-NEXT: s_sext_i32_i16 s2, s3
+; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX9-NEXT: s_sext_i32_i16 s3, s1
+; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT: s_xor_b32 s2, s5, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s6, s2, 1
+; GFX9-NEXT: s_xor_b32 s0, s3, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s4, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
-; GFX9-NEXT: v_add_u32_e32 v2, s2, v4
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s4, 0
+; GFX9-NEXT: v_add_u32_e32 v2, s0, v4
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0
-; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%r = srem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -5512,16 +5498,15 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
; GFX6-LABEL: udiv_v2i32_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_lshr_b32 s0, s2, 12
-; GFX6-NEXT: s_lshr_b32 s1, s3, 12
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_lshr_b32 s4, s4, 12
+; GFX6-NEXT: s_lshr_b32 s5, s5, 12
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2k_denom:
@@ -5555,19 +5540,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_lshr_b32 s0, s2, 12
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT: s_lshr_b32 s4, s4, 12
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
@@ -5662,31 +5646,29 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
;
; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s11, 0xf000
-; GFX6-NEXT: s_mov_b32 s10, -1
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s0
+; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GFX6-NEXT: s_sub_i32 s2, 0, s0
-; GFX6-NEXT: s_mov_b32 s8, s4
-; GFX6-NEXT: s_mov_b32 s9, s5
+; GFX6-NEXT: s_sub_i32 s1, 0, s0
+; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
-; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s1
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
; GFX6-NEXT: s_mul_i32 s1, s1, s0
-; GFX6-NEXT: s_sub_i32 s1, s6, s1
+; GFX6-NEXT: s_sub_i32 s1, s8, s1
; GFX6-NEXT: s_sub_i32 s3, s1, s0
; GFX6-NEXT: s_cmp_ge_u32 s1, s0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
@@ -5701,10 +5683,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT: v_mul_hi_u32 v1, s7, v1
+; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX6-NEXT: v_readfirstlane_b32 s0, v1
; GFX6-NEXT: s_mul_i32 s0, s0, s2
-; GFX6-NEXT: s_sub_i32 s0, s7, s0
+; GFX6-NEXT: s_sub_i32 s0, s9, s0
; GFX6-NEXT: s_sub_i32 s1, s0, s2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; GFX6-NEXT: s_cmp_ge_u32 s0, s2
@@ -5715,19 +5697,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_ge_u32 s0, s2
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_sub_i32 s4, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -5739,37 +5721,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: s_add_i32 s5, s5, s4
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
; GFX9-NEXT: s_mul_i32 s5, s4, s7
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
+; GFX9-NEXT: s_sub_i32 s0, s0, s5
; GFX9-NEXT: s_add_i32 s9, s4, 1
-; GFX9-NEXT: s_sub_i32 s5, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_sub_i32 s5, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_cselect_b32 s0, s5, s0
; GFX9-NEXT: s_add_i32 s5, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s2, s5, s4
+; GFX9-NEXT: s_cselect_b32 s0, s5, s4
; GFX9-NEXT: s_sub_i32 s4, 0, s6
; GFX9-NEXT: s_mul_i32 s4, s4, s8
; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
; GFX9-NEXT: s_mul_i32 s5, s4, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
; GFX9-NEXT: s_add_i32 s7, s4, 1
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
; GFX9-NEXT: s_add_i32 s5, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = udiv <2 x i32> %x, %shl.y
@@ -5908,16 +5890,15 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
; GFX6-LABEL: urem_v2i32_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_and_b32 s0, s2, 0xfff
-; GFX6-NEXT: s_and_b32 s1, s3, 0xfff
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_and_b32 s4, s4, 0xfff
+; GFX6-NEXT: s_and_b32 s5, s5, 0xfff
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i32_pow2k_denom:
@@ -6004,67 +5985,64 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
;
; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT: s_sub_i32 s0, 0, s6
-; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s1
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT: s_sub_i32 s6, 0, s2
+; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: s_mul_i32 s4, s4, s6
-; GFX6-NEXT: s_sub_i32 s2, s2, s4
-; GFX6-NEXT: s_sub_i32 s4, s2, s6
-; GFX6-NEXT: s_cmp_ge_u32 s2, s6
-; GFX6-NEXT: s_cselect_b32 s2, s4, s2
-; GFX6-NEXT: s_sub_i32 s4, s2, s6
-; GFX6-NEXT: s_cmp_ge_u32 s2, s6
-; GFX6-NEXT: s_cselect_b32 s2, s4, s2
-; GFX6-NEXT: s_sub_i32 s4, 0, s8
-; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1
-; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: v_readfirstlane_b32 s6, v0
+; GFX6-NEXT: s_mul_i32 s6, s6, s2
+; GFX6-NEXT: s_sub_i32 s0, s0, s6
+; GFX6-NEXT: s_sub_i32 s6, s0, s2
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
+; GFX6-NEXT: s_cselect_b32 s0, s6, s0
+; GFX6-NEXT: s_sub_i32 s6, s0, s2
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
+; GFX6-NEXT: s_cselect_b32 s0, s6, s0
+; GFX6-NEXT: s_sub_i32 s2, 0, s3
+; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s8
-; GFX6-NEXT: s_sub_i32 s0, s3, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s8
-; GFX6-NEXT: s_cmp_ge_u32 s0, s8
-; GFX6-NEXT: s_cselect_b32 s0, s1, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s8
-; GFX6-NEXT: s_cmp_ge_u32 s0, s8
-; GFX6-NEXT: s_cselect_b32 s0, s1, s0
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s2, s2, s3
+; GFX6-NEXT: s_sub_i32 s1, s1, s2
+; GFX6-NEXT: s_sub_i32 s2, s1, s3
+; GFX6-NEXT: s_cmp_ge_u32 s1, s3
+; GFX6-NEXT: s_cselect_b32 s1, s2, s1
+; GFX6-NEXT: s_sub_i32 s2, s1, s3
+; GFX6-NEXT: s_cmp_ge_u32 s1, s3
+; GFX6-NEXT: s_cselect_b32 s1, s2, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_sub_i32 s4, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -6076,33 +6054,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: s_add_i32 s5, s5, s4
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
; GFX9-NEXT: s_mul_i32 s4, s4, s7
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s4, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s4, s2, s7
-; GFX9-NEXT: s_cmp_ge_u32 s2, s7
+; GFX9-NEXT: s_sub_i32 s0, s0, s4
+; GFX9-NEXT: s_sub_i32 s4, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s4, s0, s7
+; GFX9-NEXT: s_cmp_ge_u32 s0, s7
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
; GFX9-NEXT: s_sub_i32 s4, 0, s6
; GFX9-NEXT: s_mul_i32 s4, s4, s8
; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s4, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_i32 s1, s1, s4
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = urem <2 x i32> %x, %shl.y
@@ -6291,22 +6269,21 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_ashr_i32 s0, s2, 31
-; GFX6-NEXT: s_ashr_i32 s1, s3, 31
-; GFX6-NEXT: s_lshr_b32 s0, s0, 20
-; GFX6-NEXT: s_lshr_b32 s1, s1, 20
-; GFX6-NEXT: s_add_i32 s0, s2, s0
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_ashr_i32 s0, s0, 12
-; GFX6-NEXT: s_ashr_i32 s1, s1, 12
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_ashr_i32 s6, s4, 31
+; GFX6-NEXT: s_ashr_i32 s7, s5, 31
+; GFX6-NEXT: s_lshr_b32 s6, s6, 20
+; GFX6-NEXT: s_lshr_b32 s7, s7, 20
+; GFX6-NEXT: s_add_i32 s4, s4, s6
+; GFX6-NEXT: s_add_i32 s5, s5, s7
+; GFX6-NEXT: s_ashr_i32 s4, s4, 12
+; GFX6-NEXT: s_ashr_i32 s5, s5, 12
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
@@ -6346,22 +6323,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_ashr_i32 s0, s2, 31
-; GFX6-NEXT: s_lshr_b32 s0, s0, 20
-; GFX6-NEXT: s_add_i32 s0, s2, s0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0
-; GFX6-NEXT: s_ashr_i32 s0, s0, 12
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0
+; GFX6-NEXT: s_ashr_i32 s6, s4, 31
+; GFX6-NEXT: s_lshr_b32 s6, s6, 20
+; GFX6-NEXT: s_add_i32 s4, s4, s6
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; GFX6-NEXT: s_ashr_i32 s4, s4, 12
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0
-; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
@@ -6477,138 +6453,136 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
;
; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6
-; GFX6-NEXT: s_abs_i32 s8, s6
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT: s_sub_i32 s0, 0, s8
-; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s7
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
+; GFX6-NEXT: s_abs_i32 s6, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT: s_sub_i32 s7, 0, s6
+; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0
+; GFX6-NEXT: s_abs_i32 s7, s0
+; GFX6-NEXT: s_xor_b32 s0, s0, s2
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_abs_i32 s4, s2
-; GFX6-NEXT: s_xor_b32 s2, s2, s6
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: s_ashr_i32 s2, s2, 31
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: v_readfirstlane_b32 s5, v0
-; GFX6-NEXT: s_mul_i32 s5, s5, s8
-; GFX6-NEXT: s_sub_i32 s4, s4, s5
-; GFX6-NEXT: s_sub_i32 s5, s4, s8
+; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s2, s2, s6
+; GFX6-NEXT: s_sub_i32 s2, s7, s2
+; GFX6-NEXT: s_sub_i32 s7, s2, s6
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s4, s8
+; GFX6-NEXT: s_cmp_ge_u32 s2, s6
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s4, s5, s4
+; GFX6-NEXT: s_cselect_b32 s2, s7, s2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s4, s8
+; GFX6-NEXT: s_cmp_ge_u32 s2, s6
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT: s_abs_i32 s8, s9
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_sub_i32 s0, 0, s8
+; GFX6-NEXT: s_abs_i32 s2, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX6-NEXT: s_sub_i32 s6, 0, s2
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT: s_xor_b32 s3, s1, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_abs_i32 s1, s3
-; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT: v_mul_lo_u32 v3, s0, v2
-; GFX6-NEXT: s_xor_b32 s0, s3, s9
-; GFX6-NEXT: s_ashr_i32 s0, s0, 31
+; GFX6-NEXT: s_ashr_i32 s3, s3, 31
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v1
-; GFX6-NEXT: s_mul_i32 s2, s2, s8
-; GFX6-NEXT: s_sub_i32 s1, s1, s2
-; GFX6-NEXT: s_sub_i32 s2, s1, s8
+; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: s_mul_i32 s0, s0, s2
+; GFX6-NEXT: s_sub_i32 s0, s1, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT: s_cmp_ge_u32 s1, s8
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: s_cselect_b32 s1, s2, s1
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT: s_cmp_ge_u32 s1, s8
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
+; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1
+; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
-; GFX9-NEXT: s_abs_i32 s8, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7
-; GFX9-NEXT: s_abs_i32 s5, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2
+; GFX9-NEXT: s_abs_i32 s6, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3
+; GFX9-NEXT: s_abs_i32 s3, s0
+; GFX9-NEXT: s_xor_b32 s0, s0, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s6, 0, s8
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_sub_i32 s2, 0, s6
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7
-; GFX9-NEXT: s_mul_i32 s7, s6, s8
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
-; GFX9-NEXT: s_add_i32 s9, s6, 1
-; GFX9-NEXT: s_sub_i32 s7, s5, s8
-; GFX9-NEXT: s_cmp_ge_u32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s5, s7, s6
-; GFX9-NEXT: s_abs_i32 s6, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: s_xor_b32 s5, s5, s2
-; GFX9-NEXT: s_sub_i32 s7, 0, s6
-; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: v_readfirstlane_b32 s8, v0
+; GFX9-NEXT: s_mul_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s2
+; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8
+; GFX9-NEXT: s_mul_i32 s8, s2, s6
+; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s8, s3, s6
+; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s8, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_cselect_b32 s6, s8, s2
+; GFX9-NEXT: s_abs_i32 s8, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT: s_xor_b32 s5, s6, s0
+; GFX9-NEXT: s_sub_i32 s6, 0, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s4, s3, s4
-; GFX9-NEXT: s_abs_i32 s3, s3
-; GFX9-NEXT: s_ashr_i32 s4, s4, 31
+; GFX9-NEXT: s_sub_i32 s0, s5, s0
+; GFX9-NEXT: s_xor_b32 s4, s1, s7
+; GFX9-NEXT: s_abs_i32 s1, s1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s5
-; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
-; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
-; GFX9-NEXT: s_mul_i32 s7, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
-; GFX9-NEXT: s_add_i32 s8, s5, 1
-; GFX9-NEXT: s_sub_i32 s7, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s5, s8, s5
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
+; GFX9-NEXT: s_mul_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
+; GFX9-NEXT: s_add_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
+; GFX9-NEXT: s_mul_i32 s6, s5, s8
+; GFX9-NEXT: s_sub_i32 s1, s1, s6
; GFX9-NEXT: s_add_i32 s7, s5, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s7, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_i32 s6, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_add_i32 s6, s5, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s1, s6, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s4
+; GFX9-NEXT: s_sub_i32 s1, s1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = sdiv <2 x i32> %x, %shl.y
@@ -6798,24 +6772,23 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
; GFX6-LABEL: srem_v2i32_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_ashr_i32 s0, s2, 31
-; GFX6-NEXT: s_ashr_i32 s1, s3, 31
-; GFX6-NEXT: s_lshr_b32 s0, s0, 20
-; GFX6-NEXT: s_lshr_b32 s1, s1, 20
-; GFX6-NEXT: s_add_i32 s0, s2, s0
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000
-; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000
-; GFX6-NEXT: s_sub_i32 s0, s2, s0
-; GFX6-NEXT: s_sub_i32 s1, s3, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_ashr_i32 s6, s4, 31
+; GFX6-NEXT: s_lshr_b32 s6, s6, 20
+; GFX6-NEXT: s_ashr_i32 s7, s5, 31
+; GFX6-NEXT: s_add_i32 s6, s4, s6
+; GFX6-NEXT: s_lshr_b32 s7, s7, 20
+; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000
+; GFX6-NEXT: s_sub_i32 s4, s4, s6
+; GFX6-NEXT: s_add_i32 s6, s5, s7
+; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000
+; GFX6-NEXT: s_sub_i32 s5, s5, s6
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i32_pow2k_denom:
@@ -6926,125 +6899,122 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
;
; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6
-; GFX6-NEXT: s_abs_i32 s6, s0
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT: s_sub_i32 s0, 0, s6
+; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
+; GFX6-NEXT: s_abs_i32 s2, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT: s_sub_i32 s6, 0, s2
+; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s7
+; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT: s_abs_i32 s6, s0
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_abs_i32 s4, s2
-; GFX6-NEXT: s_ashr_i32 s2, s2, 31
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_mul_i32 s7, s7, s6
-; GFX6-NEXT: s_sub_i32 s4, s4, s7
-; GFX6-NEXT: s_sub_i32 s7, s4, s6
-; GFX6-NEXT: s_cmp_ge_u32 s4, s6
-; GFX6-NEXT: s_cselect_b32 s4, s7, s4
-; GFX6-NEXT: s_sub_i32 s7, s4, s6
-; GFX6-NEXT: s_cmp_ge_u32 s4, s6
-; GFX6-NEXT: s_cselect_b32 s8, s7, s4
-; GFX6-NEXT: s_abs_i32 s9, s5
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT: s_sub_i32 s4, 0, s9
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_ashr_i32 s1, s3, 31
+; GFX6-NEXT: s_mul_i32 s7, s7, s2
+; GFX6-NEXT: s_sub_i32 s6, s6, s7
+; GFX6-NEXT: s_sub_i32 s7, s6, s2
+; GFX6-NEXT: s_cmp_ge_u32 s6, s2
+; GFX6-NEXT: s_cselect_b32 s6, s7, s6
+; GFX6-NEXT: s_sub_i32 s7, s6, s2
+; GFX6-NEXT: s_cmp_ge_u32 s6, s2
+; GFX6-NEXT: s_cselect_b32 s2, s7, s6
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT: s_sub_i32 s6, 0, s3
+; GFX6-NEXT: s_abs_i32 s8, s1
+; GFX6-NEXT: s_xor_b32 s2, s2, s0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT: s_sub_i32 s0, s2, s0
+; GFX6-NEXT: s_ashr_i32 s1, s1, 31
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_abs_i32 s0, s3
-; GFX6-NEXT: s_xor_b32 s3, s8, s2
+; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_sub_i32 s2, s3, s2
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s3, v0
-; GFX6-NEXT: s_mul_i32 s3, s3, s9
-; GFX6-NEXT: s_sub_i32 s0, s0, s3
-; GFX6-NEXT: s_sub_i32 s3, s0, s9
-; GFX6-NEXT: s_cmp_ge_u32 s0, s9
-; GFX6-NEXT: s_cselect_b32 s0, s3, s0
-; GFX6-NEXT: s_sub_i32 s3, s0, s9
-; GFX6-NEXT: s_cmp_ge_u32 s0, s9
-; GFX6-NEXT: s_cselect_b32 s0, s3, s0
-; GFX6-NEXT: s_xor_b32 s0, s0, s1
-; GFX6-NEXT: s_sub_i32 s0, s0, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s2, s2, s3
+; GFX6-NEXT: s_sub_i32 s2, s8, s2
+; GFX6-NEXT: s_sub_i32 s8, s2, s3
+; GFX6-NEXT: s_cmp_ge_u32 s2, s3
+; GFX6-NEXT: s_cselect_b32 s2, s8, s2
+; GFX6-NEXT: s_sub_i32 s8, s2, s3
+; GFX6-NEXT: s_cmp_ge_u32 s2, s3
+; GFX6-NEXT: s_cselect_b32 s2, s8, s2
+; GFX6-NEXT: s_xor_b32 s2, s2, s1
+; GFX6-NEXT: s_sub_i32 s1, s2, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6
-; GFX9-NEXT: s_abs_i32 s6, s0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7
-; GFX9-NEXT: s_sub_i32 s7, 0, s6
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s5, s2, 31
+; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2
; GFX9-NEXT: s_abs_i32 s2, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT: s_sub_i32 s7, 0, s2
+; GFX9-NEXT: s_ashr_i32 s6, s0, 31
+; GFX9-NEXT: s_abs_i32 s0, s0
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7
; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_hi_u32 s7, s2, s8
-; GFX9-NEXT: s_mul_i32 s7, s7, s6
-; GFX9-NEXT: s_sub_i32 s2, s2, s7
-; GFX9-NEXT: s_sub_i32 s7, s2, s6
-; GFX9-NEXT: s_cmp_ge_u32 s2, s6
-; GFX9-NEXT: s_cselect_b32 s2, s7, s2
-; GFX9-NEXT: s_sub_i32 s7, s2, s6
-; GFX9-NEXT: s_cmp_ge_u32 s2, s6
-; GFX9-NEXT: s_cselect_b32 s2, s7, s2
-; GFX9-NEXT: s_abs_i32 s4, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s7, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
+; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8
+; GFX9-NEXT: s_mul_i32 s7, s7, s2
+; GFX9-NEXT: s_sub_i32 s0, s0, s7
+; GFX9-NEXT: s_sub_i32 s7, s0, s2
+; GFX9-NEXT: s_cmp_ge_u32 s0, s2
+; GFX9-NEXT: s_cselect_b32 s0, s7, s0
+; GFX9-NEXT: s_sub_i32 s7, s0, s2
+; GFX9-NEXT: s_cmp_ge_u32 s0, s2
+; GFX9-NEXT: s_cselect_b32 s0, s7, s0
+; GFX9-NEXT: s_abs_i32 s7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_xor_b32 s0, s0, s6
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT: s_sub_i32 s5, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s6, s3, 31
-; GFX9-NEXT: s_abs_i32 s3, s3
+; GFX9-NEXT: s_sub_i32 s0, s0, s6
+; GFX9-NEXT: s_ashr_i32 s4, s1, 31
+; GFX9-NEXT: s_abs_i32 s1, s1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s5
-; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
-; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
-; GFX9-NEXT: s_mul_i32 s5, s5, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s6, v0
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s4
+; GFX9-NEXT: s_sub_i32 s1, s1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = srem <2 x i32> %x, %shl.y
@@ -10151,9 +10121,6 @@ define i64 @udiv_i64_9divbits(i8 %size) {
}
define <2 x i64> @srem_zero_zero() {
-; GCN-LABEL: kernel:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_endpgm
; GFX6-LABEL: srem_zero_zero:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 56ad037f65641..37f4094806637 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -257,29 +257,28 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: s_lshl_b32 s0, s3, 16
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_lshl_b32 s4, s4, 16
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_lshl_b32 s0, s3, 16
-; GFX8-NEXT: s_lshl_b32 s1, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 61fb18e00917b..c46fcde739b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
-; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
-; CI-NEXT: v_mov_b32_e32 v2, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_fabs_v4bf16:
@@ -234,13 +234,23 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
-; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_and_b32 s4, s3, 0x7fff
+; VI-NEXT: s_lshr_b32 s3, s3, 16
+; VI-NEXT: s_and_b32 s5, s2, 0x7fff
+; VI-NEXT: s_lshr_b32 s2, s2, 16
+; VI-NEXT: s_and_b32 s3, s3, 0x7fff
+; VI-NEXT: s_and_b32 s2, s2, 0x7fff
+; VI-NEXT: s_and_b32 s4, 0xffff, s4
+; VI-NEXT: s_and_b32 s5, 0xffff, s5
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_lshl_b32 s2, s2, 16
+; VI-NEXT: s_or_b32 s3, s4, s3
+; VI-NEXT: s_or_b32 s2, s5, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_fabs_v4bf16:
@@ -248,8 +258,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff
-; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; GFX9-NEXT: s_and_b32 s4, s3, 0x7fff
+; GFX9-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-NEXT: s_and_b32 s5, s2, 0x7fff
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff
+; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -259,8 +275,14 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
-; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; GFX11-NEXT: s_and_b32 s4, s3, 0x7fff
+; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_lshr_b32 s5, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s5, s5, 0x7fff
+; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index a77c7ae923d0f..27cf49aec8229 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
-; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
-; CI-NEXT: v_mov_b32_e32 v2, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_fabs_v4f16:
@@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
-; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index baf9b0abf7b0c..97e23fcdb2263 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -99,29 +99,28 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fabs_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff
-; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_bitset0_b32 s5, 31
+; SI-NEXT: s_bitset0_b32 s4, 31
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabs_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
-; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_bitset0_b32 s3, 31
+; VI-NEXT: s_bitset0_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 1d87d938cc41c..0a2e758f7cf21 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -472,52 +472,50 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
; SI-LABEL: s_test_copysign_v2f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_brev_b32 s8, -2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_brev_b32 s0, -2
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: v_bfi_b32 v1, s0, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_bfi_b32 v0, s0, v0, v2
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_bfi_b32 v1, s8, v0, v1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_bfi_b32 v0, s8, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s6, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_bfi_b32 v1, s6, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_bfi_b32 v0, s6, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_bfi_b32 v3, s6, v2, v3
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_bfi_b32 v2, s6, v2, v4
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
store <2 x float> %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 0fc61cbe54bad..c510c40c8536c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -932,18 +932,16 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX6-FASTFMA-LABEL: s_fdiv_v2f32:
; GFX6-FASTFMA: ; %bb.0: ; %entry
-; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1
+; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1
; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0
-; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s9
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3
@@ -952,13 +950,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
+; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4
@@ -968,21 +966,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5
-; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2
-; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2
+; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-FASTFMA-NEXT: s_endpgm
;
; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32:
; GFX6-SLOWFMA: ; %bb.0: ; %entry
-; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2
-; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0
@@ -992,13 +989,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
-; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
+; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2
-; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0
+; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5
@@ -1008,24 +1006,22 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4
-; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4
+; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-SLOWFMA-NEXT: s_endpgm
;
; GFX7-LABEL: s_fdiv_v2f32:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_mov_b32 s4, s0
-; GFX7-NEXT: s_mov_b32 s5, s1
-; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, s9
-; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s11
+; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
@@ -1034,13 +1030,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4
; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
+; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
; GFX7-NEXT: v_rcp_f32_e32 v4, v3
-; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
+; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4
@@ -1050,20 +1046,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5
-; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2
-; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_rcp_f32_e32 v3, v1
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0
@@ -1073,12 +1068,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
+; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0
+; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0
; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5
@@ -1088,20 +1084,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3
+; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -1111,11 +1106,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: s_denorm_mode 12
-; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2
+; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
-; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3
-; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2
+; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1
+; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
@@ -1126,18 +1122,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3
+; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
@@ -1148,11 +1145,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX11-NEXT: s_denorm_mode 12
-; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2
+; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
-; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3
-; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2
+; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1
+; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -1164,8 +1161,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32:
@@ -1190,60 +1187,58 @@ entry:
define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_ulp25_v2f32:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s8
-; GFX67-NEXT: v_rcp_f32_e32 v1, s9
-; GFX67-NEXT: s_mov_b32 s4, s0
-; GFX67-NEXT: s_mov_b32 s5, s1
-; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1
+; GFX67-NEXT: v_rcp_f32_e32 v0, s2
+; GFX67-NEXT: v_rcp_f32_e32 v1, s3
+; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v2, s6
-; GFX8-NEXT: v_rcp_f32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_rcp_f32_e32 v0, s2
+; GFX8-NEXT: v_rcp_f32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_ulp25_v2f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s6
-; GFX10-NEXT: v_rcp_f32_e32 v1, s7
-; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s2
+; GFX10-NEXT: v_rcp_f32_e32 v1, s3
+; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_ulp25_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s6
-; GFX11-NEXT: v_rcp_f32_e32 v1, s7
+; GFX11-NEXT: v_rcp_f32_e32 v0, s2
+; GFX11-NEXT: v_rcp_f32_e32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1268,60 +1263,58 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_v2f32_fast_math:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s9
-; GFX67-NEXT: v_rcp_f32_e32 v2, s8
-; GFX67-NEXT: s_mov_b32 s4, s0
-; GFX67-NEXT: s_mov_b32 s5, s1
-; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
+; GFX67-NEXT: v_rcp_f32_e32 v0, s3
+; GFX67-NEXT: v_rcp_f32_e32 v2, s2
+; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v2, s7
-; GFX8-NEXT: v_rcp_f32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_rcp_f32_e32 v2, s2
+; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s7
-; GFX10-NEXT: v_rcp_f32_e32 v2, s6
-; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
-; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s3
+; GFX10-NEXT: v_rcp_f32_e32 v2, s2
+; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32_fast_math:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s7
-; GFX11-NEXT: v_rcp_f32_e32 v2, s6
+; GFX11-NEXT: v_rcp_f32_e32 v0, s3
+; GFX11-NEXT: v_rcp_f32_e32 v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1346,60 +1339,58 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_v2f32_arcp_math:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s9
-; GFX67-NEXT: v_rcp_f32_e32 v2, s8
-; GFX67-NEXT: s_mov_b32 s4, s0
-; GFX67-NEXT: s_mov_b32 s5, s1
-; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
+; GFX67-NEXT: v_rcp_f32_e32 v0, s3
+; GFX67-NEXT: v_rcp_f32_e32 v2, s2
+; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v2, s7
-; GFX8-NEXT: v_rcp_f32_e32 v4, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_rcp_f32_e32 v0, s3
+; GFX8-NEXT: v_rcp_f32_e32 v2, s2
+; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32_arcp_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s7
-; GFX10-NEXT: v_rcp_f32_e32 v2, s6
-; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
-; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s3
+; GFX10-NEXT: v_rcp_f32_e32 v2, s2
+; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32_arcp_math:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s7
-; GFX11-NEXT: v_rcp_f32_e32 v2, s6
+; GFX11-NEXT: v_rcp_f32_e32 v0, s3
+; GFX11-NEXT: v_rcp_f32_e32 v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32_arcp_math:
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index 3e450b785b57b..6c2ab5fb15a20 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -121,25 +121,24 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
; SICI-LABEL: fnearbyint_v2f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SICI-NEXT: s_mov_b32 s7, 0xf000
-; SICI-NEXT: s_mov_b32 s6, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
-; SICI-NEXT: s_mov_b32 s4, s0
-; SICI-NEXT: s_mov_b32 s5, s1
-; SICI-NEXT: v_rndne_f32_e32 v1, s3
-; SICI-NEXT: v_rndne_f32_e32 v0, s2
-; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SICI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SICI-NEXT: s_mov_b32 s3, 0xf000
+; SICI-NEXT: s_mov_b32 s2, -1
+; SICI-NEXT: v_rndne_f32_e32 v1, s5
+; SICI-NEXT: v_rndne_f32_e32 v0, s4
+; SICI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SICI-NEXT: s_endpgm
;
; VI-LABEL: fnearbyint_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_rndne_f32_e32 v3, s3
-; VI-NEXT: v_rndne_f32_e32 v2, s2
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rndne_f32_e32 v1, s3
+; VI-NEXT: v_rndne_f32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 468df77f5c2aa..5424ebfcffcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -624,13 +624,13 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_or_b32 s0, s3, 0x80008000
-; CI-NEXT: s_or_b32 s1, s2, 0x80008000
-; CI-NEXT: v_mov_b32_e32 v2, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CI-NEXT: s_or_b32 s3, s3, 0x80008000
+; CI-NEXT: s_or_b32 s2, s2, 0x80008000
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabs_v4bf16:
@@ -640,25 +640,23 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s2, 0x7fff7fff
-; VI-NEXT: s_and_b32 s1, s3, 0x7fff7fff
-; VI-NEXT: s_bfe_u32 s3, s3, 0xf0010
-; VI-NEXT: s_bfe_u32 s2, s2, 0xf0010
-; VI-NEXT: s_xor_b32 s1, s1, 0x8000
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_xor_b32 s0, s0, 0x8000
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_and_b32 s1, 0xffff, s1
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s0, 0xffff, s0
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_lshr_b32 s4, s2, 16
+; VI-NEXT: s_lshr_b32 s5, s3, 16
+; VI-NEXT: s_bitset1_b32 s3, 15
+; VI-NEXT: s_bitset1_b32 s2, 15
+; VI-NEXT: s_bitset1_b32 s5, 15
+; VI-NEXT: s_bitset1_b32 s4, 15
+; VI-NEXT: s_and_b32 s3, 0xffff, s3
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_and_b32 s2, 0xffff, s2
+; VI-NEXT: s_lshl_b32 s4, s4, 16
+; VI-NEXT: s_or_b32 s3, s3, s5
+; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fneg_fabs_v4bf16:
@@ -666,16 +664,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff7fff
-; GFX9-NEXT: s_and_b32 s5, s3, 0x7fff7fff
-; GFX9-NEXT: s_bfe_u32 s3, s3, 0xf0010
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf0010
-; GFX9-NEXT: s_xor_b32 s3, s3, 0x8000
-; GFX9-NEXT: s_xor_b32 s5, s5, 0x8000
-; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX9-NEXT: s_xor_b32 s4, s4, 0x8000
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
+; GFX9-NEXT: s_lshr_b32 s4, s2, 16
+; GFX9-NEXT: s_lshr_b32 s5, s3, 16
+; GFX9-NEXT: s_bitset1_b32 s3, 15
+; GFX9-NEXT: s_bitset1_b32 s2, 15
+; GFX9-NEXT: s_bitset1_b32 s5, 15
+; GFX9-NEXT: s_bitset1_b32 s4, 15
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -685,16 +681,14 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s4, s2, 0x7fff7fff
-; GFX11-NEXT: s_and_b32 s5, s3, 0x7fff7fff
-; GFX11-NEXT: s_bfe_u32 s3, s3, 0xf0010
-; GFX11-NEXT: s_bfe_u32 s2, s2, 0xf0010
-; GFX11-NEXT: s_xor_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX11-NEXT: s_xor_b32 s4, s4, 0x8000
-; GFX11-NEXT: s_xor_b32 s5, s5, 0x8000
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s3
+; GFX11-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-NEXT: s_bitset1_b32 s3, 15
+; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_bitset1_b32 s4, 15
+; GFX11-NEXT: s_bitset1_b32 s5, 15
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 305f4e56184cc..9d9a851a5507e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000
-; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v2, s1
-; CIVI-NEXT: v_mov_b32_e32 v3, s0
-; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
+; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
+; CIVI-NEXT: v_mov_b32_e32 v3, s1
+; CIVI-NEXT: v_mov_b32_e32 v0, s2
+; CIVI-NEXT: v_mov_b32_e32 v1, s3
+; CIVI-NEXT: v_mov_b32_e32 v2, s0
+; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CIVI-NEXT: s_endpgm
;
; GFX9-LABEL: fneg_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index b93a598cb52ae..214ccedd75170 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -199,29 +199,28 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
; SI-LABEL: fneg_fabsf_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitset1_b32 s3, 31
-; SI-NEXT: s_bitset1_b32 s2, 31
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_bitset1_b32 s4, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabsf_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: s_or_b32 s1, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_bitset1_b32 s3, 31
+; VI-NEXT: s_bitset1_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 17225b7c39f4f..02235151a83e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -52,29 +52,28 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
; SI-LABEL: s_fneg_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fneg_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; VI-NEXT: s_xor_b32 s1, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 35de9ccd99739..a2cd6d28e96cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -134,27 +134,24 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
; SI-LABEL: fp_to_sint_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_i32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_i32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_i32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_sint_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v1, s3
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_sint_v2i32:
@@ -438,26 +435,25 @@ entry:
define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) {
; SI-LABEL: fp_to_sint_v2i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s6, 0x2f800000
+; SI-NEXT: s_mov_b32 s7, 0xcf800000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s8, 0x2f800000
-; SI-NEXT: s_mov_b32 s9, 0xcf800000
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: v_trunc_f32_e32 v0, s7
-; SI-NEXT: v_trunc_f32_e32 v1, s6
-; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v1, s4
+; SI-NEXT: v_mul_f32_e64 v2, |v0|, s6
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8
+; SI-NEXT: v_mul_f32_e64 v4, |v1|, s6
; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; SI-NEXT: v_floor_f32_e32 v2, v2
; SI-NEXT: v_floor_f32_e32 v4, v4
; SI-NEXT: v_cvt_u32_f32_e32 v6, v2
-; SI-NEXT: v_fma_f32 v0, v2, s9, |v0|
+; SI-NEXT: v_fma_f32 v0, v2, s7, |v0|
; SI-NEXT: v_cvt_u32_f32_e32 v2, v4
-; SI-NEXT: v_fma_f32 v1, v4, s9, |v1|
+; SI-NEXT: v_fma_f32 v1, v4, s7, |v1|
; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
; SI-NEXT: v_xor_b32_e32 v4, v6, v3
; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -474,36 +470,35 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
; VI-LABEL: fp_to_sint_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s6, 0x2f800000
+; VI-NEXT: s_mov_b32 s7, 0xcf800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
-; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; VI-NEXT: v_trunc_f32_e32 v0, s5
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: s_mov_b32 s0, 0xcf800000
-; VI-NEXT: v_fma_f32 v2, v1, s0, |v0|
-; VI-NEXT: v_trunc_f32_e32 v4, s2
-; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
-; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_floor_f32_e32 v3, v3
-; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s0, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v2, v1
+; VI-NEXT: v_fma_f32 v1, v1, s7, |v0|
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
-; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
-; VI-NEXT: v_xor_b32_e32 v2, v2, v0
+; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_xor_b32_e32 v3, v2, v0
+; VI-NEXT: v_mul_f32_e64 v2, |v4|, s6
+; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
+; VI-NEXT: v_floor_f32_e32 v2, v2
+; VI-NEXT: v_cvt_u32_f32_e32 v5, v2
+; VI-NEXT: v_fma_f32 v2, v2, s7, |v4|
+; VI-NEXT: v_cvt_u32_f32_e32 v6, v2
; VI-NEXT: v_xor_b32_e32 v1, v1, v0
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v1, v0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
+; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_sint_v2i64:
@@ -1298,32 +1293,29 @@ define amdgpu_kernel void @fp_to_sint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x
; SI-LABEL: fp_to_sint_v2f32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_i32_f32_e32 v0, s3
-; SI-NEXT: v_cvt_i32_f32_e32 v1, s2
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_i32_f32_e32 v0, s5
+; SI-NEXT: v_cvt_i32_f32_e32 v1, s4
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_sint_v2f32_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_i32_f32_e32 v1, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_sint_v2f32_to_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 106d1116c2bc6..32f80ff6c22f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -72,27 +72,24 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x
; SI-LABEL: fp_to_uint_v2f32_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_u32_f32_e32 v1, s3
-; SI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
+; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i32:
@@ -349,32 +346,29 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
; SI-LABEL: fp_to_uint_v2f32_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s8, 0xcf800000
+; SI-NEXT: s_mov_b32 s6, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_trunc_f32_e32 v0, s3
-; SI-NEXT: v_trunc_f32_e32 v2, s2
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_trunc_f32_e32 v0, s5
+; SI-NEXT: v_trunc_f32_e32 v2, s4
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; SI-NEXT: v_floor_f32_e32 v4, v1
; SI-NEXT: v_floor_f32_e32 v5, v3
; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
-; SI-NEXT: v_fma_f32 v0, v4, s8, v0
-; SI-NEXT: v_fma_f32 v4, v5, s8, v2
+; SI-NEXT: v_fma_f32 v0, v4, s6, v0
+; SI-NEXT: v_fma_f32 v4, v5, s6, v2
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s3
; VI-NEXT: v_trunc_f32_e32 v4, s2
@@ -389,9 +383,9 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i64:
@@ -1078,31 +1072,28 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x
; SI-LABEL: fp_to_uint_v2f32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_cvt_u32_f32_e32 v0, s3
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_cvt_u32_f32_e32 v0, s5
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_cvt_u32_f32_e32 v1, s2
+; SI-NEXT: v_cvt_u32_f32_e32 v1, s4
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_u32_f32_e32 v1, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 0366d618249df..72c2003058a01 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -209,81 +209,85 @@ entry:
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s0, s5
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_lshr_b32 s12, s3, 1
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; SI-NEXT: s_not_b32 s3, s7
-; SI-NEXT: s_mov_b32 s1, s12
-; SI-NEXT: s_and_b32 s3, s3, 31
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; SI-NEXT: s_mov_b32 s5, s2
-; SI-NEXT: s_lshr_b32 s1, s2, 1
-; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
-; SI-NEXT: s_mov_b32 s3, s1
-; SI-NEXT: s_not_b32 s1, s6
+; SI-NEXT: s_mov_b32 s6, s3
+; SI-NEXT: s_mov_b32 s7, s1
+; SI-NEXT: s_lshr_b32 s12, s1, 1
+; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; SI-NEXT: s_not_b32 s1, s5
+; SI-NEXT: s_mov_b32 s7, s12
; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_mov_b32 s3, s0
+; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
+; SI-NEXT: s_lshr_b32 s5, s0, 1
+; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
+; SI-NEXT: s_not_b32 s2, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_and_b32 s2, s2, 31
+; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_mov_b32 s0, s5
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_lshr_b32 s8, s3, 1
-; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; VI-NEXT: s_not_b32 s3, s7
-; VI-NEXT: s_mov_b32 s1, s8
-; VI-NEXT: s_and_b32 s3, s3, 31
-; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; VI-NEXT: s_mov_b32 s5, s2
-; VI-NEXT: s_lshr_b32 s1, s2, 1
-; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 1
-; VI-NEXT: s_mov_b32 s3, s1
-; VI-NEXT: s_not_b32 s1, s6
+; VI-NEXT: s_mov_b32 s8, s3
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_lshr_b32 s10, s1, 1
+; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
+; VI-NEXT: s_not_b32 s1, s5
+; VI-NEXT: s_mov_b32 s9, s10
; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b32 s3, s0
+; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
+; VI-NEXT: s_lshr_b32 s5, s0, 1
+; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
+; VI-NEXT: s_not_b32 s2, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_and_b32 s2, s2, 31
+; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s8
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s13
-; GFX9-NEXT: s_mov_b32 s1, s11
-; GFX9-NEXT: s_lshr_b32 s2, s11, 1
-; GFX9-NEXT: s_not_b32 s3, s15
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_mov_b32 s1, s2
-; GFX9-NEXT: s_and_b32 s2, s3, 31
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX9-NEXT: s_mov_b32 s13, s10
-; GFX9-NEXT: s_lshr_b32 s1, s10, 1
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], 1
-; GFX9-NEXT: s_mov_b32 s3, s1
-; GFX9-NEXT: s_not_b32 s1, s14
+; GFX9-NEXT: s_mov_b32 s4, s3
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_lshr_b32 s10, s1, 1
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
+; GFX9-NEXT: s_not_b32 s1, s9
+; GFX9-NEXT: s_mov_b32 s5, s10
; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: s_lshr_b32 s5, s0, 1
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
+; GFX9-NEXT: s_not_b32 s2, s8
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_and_b32 s2, s2, 31
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
@@ -306,24 +310,27 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; GFX10-LABEL: fshl_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s13
-; GFX10-NEXT: s_mov_b32 s1, s11
-; GFX10-NEXT: s_not_b32 s2, s15
-; GFX10-NEXT: s_mov_b32 s13, s10
-; GFX10-NEXT: s_lshr_b32 s4, s11, 1
-; GFX10-NEXT: s_lshr_b32 s5, s10, 1
-; GFX10-NEXT: s_not_b32 s6, s14
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_and_b32 s7, s2, 31
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], 1
-; GFX10-NEXT: s_and_b32 s6, s6, 31
-; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: s_mov_b32 s1, s4
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s7
+; GFX10-NEXT: s_mov_b32 s4, s3
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_lshr_b32 s10, s1, 1
+; GFX10-NEXT: s_not_b32 s7, s7
+; GFX10-NEXT: s_lshr_b32 s11, s0, 1
+; GFX10-NEXT: s_not_b32 s6, s6
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT: s_and_b32 s4, s7, 31
+; GFX10-NEXT: s_and_b32 s5, s6, 31
+; GFX10-NEXT: s_mov_b32 s3, s11
+; GFX10-NEXT: s_mov_b32 s1, s10
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -331,27 +338,30 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; GFX11-LABEL: fshl_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s5
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s5, s2
-; GFX11-NEXT: s_lshr_b32 s10, s3, 1
+; GFX11-NEXT: s_mov_b32 s8, s3
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_lshr_b32 s10, s1, 1
; GFX11-NEXT: s_not_b32 s7, s7
-; GFX11-NEXT: s_lshr_b32 s11, s2, 1
+; GFX11-NEXT: s_lshr_b32 s11, s0, 1
; GFX11-NEXT: s_not_b32 s6, s6
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], 1
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX11-NEXT: s_and_b32 s7, s7, 31
; GFX11-NEXT: s_and_b32 s6, s6, 31
-; GFX11-NEXT: s_mov_b32 s5, s11
-; GFX11-NEXT: s_mov_b32 s3, s10
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7
+; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_mov_b32 s1, s10
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -362,54 +372,52 @@ entry:
define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s0, s9
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_mov_b32 s9, s2
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23
-; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 25
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_mov_b32 s8, s3
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s3, s0
+; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23
+; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_mov_b32 s0, s5
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s5, s2
-; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 23
-; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b32 s6, s3
+; VI-NEXT: s_mov_b32 s7, s1
+; VI-NEXT: s_mov_b32 s3, s0
+; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23
+; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s7
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: s_mov_b32 s7, s2
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 23
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 25
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mov_b32 s4, s3
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
@@ -429,35 +437,35 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, s7
-; GFX10-NEXT: s_mov_b32 s7, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 25
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 23
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mov_b32 s4, s3
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s7
-; GFX11-NEXT: s_mov_b32 s7, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 25
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 23
+; GFX11-NEXT: s_mov_b32 s6, s3
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s7, s1
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index d3ceaba111848..7afb2cf317869 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -325,56 +325,60 @@ entry:
define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s0, s5
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_and_b32 s3, s7, 31
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; SI-NEXT: s_mov_b32 s5, s2
-; SI-NEXT: s_and_b32 s1, s6, 31
-; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_mov_b32 s6, s3
+; SI-NEXT: s_mov_b32 s7, s1
+; SI-NEXT: s_and_b32 s1, s5, 31
+; SI-NEXT: s_mov_b32 s3, s0
+; SI-NEXT: s_and_b32 s0, s4, 31
+; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
+; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s3
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_and_b32 s1, s7, 31
+; VI-NEXT: s_mov_b32 s3, s0
+; VI-NEXT: s_and_b32 s0, s6, 31
+; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
+; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_mov_b32 s0, s5
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_and_b32 s3, s7, 31
-; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; VI-NEXT: s_mov_b32 s5, s2
-; VI-NEXT: s_and_b32 s1, s6, 31
-; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s13
-; GFX9-NEXT: s_mov_b32 s1, s11
-; GFX9-NEXT: s_and_b32 s2, s15, 31
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX9-NEXT: s_mov_b32 s13, s10
-; GFX9-NEXT: s_and_b32 s1, s14, 31
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[12:13], s1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_mov_b32 s4, s3
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_and_b32 s1, s7, 31
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: s_and_b32 s0, s6, 31
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
;
@@ -394,53 +398,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; GFX10-LABEL: fshr_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s13
-; GFX10-NEXT: s_mov_b32 s1, s11
-; GFX10-NEXT: s_mov_b32 s13, s10
-; GFX10-NEXT: s_and_b32 s2, s14, 31
-; GFX10-NEXT: s_and_b32 s4, s15, 31
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[12:13], s2
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: s_mov_b32 s4, s3
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_and_b32 s0, s6, 31
+; GFX10-NEXT: s_and_b32 s6, s7, 31
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s5
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s5, s2
-; GFX11-NEXT: s_and_b32 s2, s6, 31
+; GFX11-NEXT: s_mov_b32 s8, s3
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_and_b32 s0, s6, 31
; GFX11-NEXT: s_and_b32 s6, s7, 31
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[8:9], s6
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fshr_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s5
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s5, s2
-; GFX12-NEXT: s_and_b32 s2, s6, 31
+; GFX12-NEXT: s_mov_b32 s8, s3
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_and_b32 s0, s6, 31
; GFX12-NEXT: s_and_b32 s6, s7, 31
-; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
-; GFX12-NEXT: s_lshr_b64 s[4:5], s[8:9], s6
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -451,54 +464,52 @@ entry:
define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s0, s9
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_mov_b32 s9, s2
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9
-; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], 7
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_mov_b32 s8, s3
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s3, s0
+; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9
+; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_mov_b32 s0, s5
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s5, s2
-; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], 9
-; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b32 s6, s3
+; VI-NEXT: s_mov_b32 s7, s1
+; VI-NEXT: s_mov_b32 s3, s0
+; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9
+; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s7
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: s_mov_b32 s7, s2
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 9
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 7
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mov_b32 s4, s3
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
@@ -518,52 +529,52 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-LABEL: fshr_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, s7
-; GFX10-NEXT: s_mov_b32 s7, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mov_b32 s4, s3
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_v2i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s7
-; GFX11-NEXT: s_mov_b32 s7, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
+; GFX11-NEXT: s_mov_b32 s6, s3
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s7, s1
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fshr_v2i32_imm:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s7
-; GFX12-NEXT: s_mov_b32 s7, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 7
-; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 9
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s7, s1
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -574,63 +585,61 @@ entry:
define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm_src1:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s8, 9
; SI-NEXT: s_mov_b32 s10, 7
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s0, 9
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_and_b32 s3, s9, 31
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; SI-NEXT: s_mov_b32 s11, s2
-; SI-NEXT: s_and_b32 s1, s8, 31
-; SI-NEXT: s_lshr_b64 s[2:3], s[10:11], s1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_and_b32 s1, s3, 31
+; SI-NEXT: s_mov_b32 s11, s0
+; SI-NEXT: s_and_b32 s0, s2, 31
+; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
+; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32_imm_src1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s6, 9
+; VI-NEXT: s_mov_b32 s8, 7
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s7, s1
+; VI-NEXT: s_and_b32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s9, s0
+; VI-NEXT: s_and_b32 s0, s2, 31
+; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
+; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: s_and_b32 s0, s5, 31
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0
-; VI-NEXT: s_mov_b32 s6, 7
-; VI-NEXT: s_mov_b32 s7, s2
-; VI-NEXT: s_and_b32 s1, s4, 31
-; VI-NEXT: s_lshr_b64 s[2:3], s[6:7], s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm_src1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s4, 9
; GFX9-NEXT: s_mov_b32 s8, 7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 31
-; GFX9-NEXT: s_mov_b32 s9, s2
-; GFX9-NEXT: s_and_b32 s2, s6, 31
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_and_b32 s1, s3, 31
+; GFX9-NEXT: s_mov_b32 s9, s0
+; GFX9-NEXT: s_and_b32 s0, s2, 31
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v2i32_imm_src1:
@@ -650,61 +659,61 @@ define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %
; GFX10-LABEL: fshr_v2i32_imm_src1:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: s_mov_b32 s4, 9
; GFX10-NEXT: s_mov_b32 s8, 7
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: s_mov_b32 s9, s2
-; GFX10-NEXT: s_and_b32 s2, s6, 31
-; GFX10-NEXT: s_and_b32 s6, s7, 31
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s9, s0
+; GFX10-NEXT: s_and_b32 s0, s2, 31
+; GFX10-NEXT: s_and_b32 s2, s3, 31
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_v2i32_imm_src1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s6, 9
; GFX11-NEXT: s_mov_b32 s8, 7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s3
-; GFX11-NEXT: s_mov_b32 s9, s2
-; GFX11-NEXT: s_and_b32 s2, s4, 31
-; GFX11-NEXT: s_and_b32 s4, s5, 31
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX11-NEXT: s_mov_b32 s7, s1
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_and_b32 s0, s2, 31
+; GFX11-NEXT: s_and_b32 s2, s3, 31
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fshr_v2i32_imm_src1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT: s_mov_b32 s6, 9
; GFX12-NEXT: s_mov_b32 s8, 7
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s7, s3
-; GFX12-NEXT: s_mov_b32 s9, s2
-; GFX12-NEXT: s_and_b32 s2, s4, 31
-; GFX12-NEXT: s_and_b32 s4, s5, 31
-; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
-; GFX12-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX12-NEXT: s_mov_b32 s7, s1
+; GFX12-NEXT: s_mov_b32 s9, s0
+; GFX12-NEXT: s_and_b32 s0, s2, 31
+; GFX12-NEXT: s_and_b32 s2, s3, 31
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 580eeda73781e..da132d0269e6b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -9552,47 +9552,6 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a
; GFX9-NEXT: s_cbranch_execnz .LBB136_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s3, s0, 0x4650
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_and_b32 s0, s3, -4
-; GFX11-NEXT: s_and_b32 s3, s3, 3
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT: s_lshl_b32 s5, s3, 3
-; GFX11-NEXT: s_and_b32 s6, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s3, s2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s4, s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB136_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
-; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB136_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_endpgm
%gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
%val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
@@ -9712,47 +9671,6 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad
; GFX9-NEXT: s_cbranch_execnz .LBB137_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s3, s0, 0x2328
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_and_b32 s0, s3, -4
-; GFX11-NEXT: s_and_b32 s3, s3, 3
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT: s_lshl_b32 s5, s3, 3
-; GFX11-NEXT: s_and_b32 s6, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s2, 0xff, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s3, s2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_lshl_b32 s4, s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB137_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
-; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB137_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
%val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 87e57298f5dc6..8e427a6ef2023 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
+; CIVI-NEXT: s_add_u32 s4, s0, 4
+; CIVI-NEXT: s_addc_u32 s5, s1, 0
+; CIVI-NEXT: v_mov_b32_e32 v2, s4
+; CIVI-NEXT: v_mov_b32_e32 v4, s3
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v3, s5
; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
-; CIVI-NEXT: s_add_u32 s0, s0, 4
-; CIVI-NEXT: flat_store_dword v[0:1], v2
-; CIVI-NEXT: s_addc_u32 s1, s1, 0
-; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: v_mov_b32_e32 v2, s3
-; CIVI-NEXT: flat_store_short v[0:1], v2
+; CIVI-NEXT: v_mov_b32_e32 v5, s2
+; CIVI-NEXT: flat_store_short v[2:3], v4
+; CIVI-NEXT: flat_store_dword v[0:1], v5
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: load_v3f16_arg:
@@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: v_mov_b32_e32 v2, s2
+; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: v_mov_b32_e32 v3, s3
; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CIVI-NEXT: s_endpgm
@@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; GFX11-LABEL: load_v4f16_arg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
store <4 x half> %arg, ptr addrspace(1) %out
@@ -348,21 +348,37 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
}
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
-; CIVI-LABEL: extload_v3f16_to_v3f32_arg:
-; CIVI: ; %bb.0:
-; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_lshr_b32 s4, s2, 16
-; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CIVI-NEXT: v_mov_b32_e32 v3, s0
-; CIVI-NEXT: v_mov_b32_e32 v4, s1
-; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
-; CIVI-NEXT: s_endpgm
+; CI-LABEL: extload_v3f16_to_v3f32_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_lshr_b32 s4, s2, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: extload_v3f16_to_v3f32_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s4, s2, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
+; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
; GFX11: ; %bb.0:
@@ -370,9 +386,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX11-NEXT: s_endpgm
%ext = fpext <3 x half> %arg to <3 x float>
@@ -388,14 +404,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s4, s2, 16
-; CI-NEXT: s_lshr_b32 s5, s3, 16
+; CI-NEXT: s_lshr_b32 s4, s3, 16
+; CI-NEXT: s_lshr_b32 s5, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s5
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s5
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
@@ -408,12 +424,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -424,10 +440,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s4, s3, 16
; GFX11-NEXT: s_lshr_b32 s5, s2, 16
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
%ext = fpext <4 x half> %arg to <4 x float>
@@ -708,33 +724,61 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
}
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
-; CIVI-LABEL: extload_v4f16_to_v4f64_arg:
-; CIVI: ; %bb.0:
-; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CIVI-NEXT: s_add_i32 s12, s12, s17
-; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_lshr_b32 s5, s3, 16
-; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5
-; CIVI-NEXT: s_lshr_b32 s4, s2, 16
-; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; CIVI-NEXT: s_add_u32 s2, s0, 16
-; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; CIVI-NEXT: s_addc_u32 s3, s1, 0
-; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; CIVI-NEXT: v_mov_b32_e32 v9, s3
-; CIVI-NEXT: v_mov_b32_e32 v8, s2
-; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; CIVI-NEXT: s_nop 0
-; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; CIVI-NEXT: s_endpgm
+; CI-LABEL: extload_v4f16_to_v4f64_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_lshr_b32 s4, s3, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; CI-NEXT: s_lshr_b32 s5, s2, 16
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
+; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
+; CI-NEXT: s_add_u32 s2, s0, 16
+; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; CI-NEXT: s_addc_u32 s3, s1, 0
+; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; CI-NEXT: v_mov_b32_e32 v9, s3
+; CI-NEXT: v_mov_b32_e32 v8, s2
+; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CI-NEXT: s_nop 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: extload_v4f16_to_v4f64_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s5, s3, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
+; VI-NEXT: s_lshr_b32 s4, s2, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
+; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
+; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
; GFX11: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 11826aa0b360d..e1b4cad370f96 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -290,19 +290,19 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec,
; GCN-LABEL: half4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT: s_lshl_b32 s6, s6, 4
+; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_lshl_b32 s0, s5, 4
-; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
@@ -418,19 +418,19 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec,
; GCN-LABEL: short4_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x10001
+; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GCN-NEXT: s_lshl_b32 s6, s6, 4
+; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_lshl_b32 s0, s5, 4
-; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
-; GCN-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
@@ -443,18 +443,18 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-LABEL: byte8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x34
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101
+; GCN-NEXT: s_lshl_b32 s6, s6, 3
+; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101
+; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_lshl_b32 s4, s4, 3
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_xor_b32 s1, s3, 0x1010101
-; GCN-NEXT: s_xor_b32 s0, s2, 0x1010101
-; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index e98d04556649a..7cbf9aeacfe48 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1571,13 +1571,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 4
-; VI-NEXT: s_mov_b32 s8, 0x50005
-; VI-NEXT: s_mov_b32 s9, s8
+; VI-NEXT: s_mov_b32 s0, 0x50005
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; VI-NEXT: s_xor_b64 s[8:9], s[2:3], s[8:9]
-; VI-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1]
+; VI-NEXT: s_mov_b32 s1, s0
+; VI-NEXT: s_lshl_b32 s8, s8, 4
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 41b5103b38e50..a2da8876472ab 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s0, 4
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s0, s0, 4
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: flat_store_short v[2:3], v4
+; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3i16_arg:
@@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 23c5a079c5c6e..ab0000f6831b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -139,26 +139,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,51 +177,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v4, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_mov_b32_e32 v8, s0
-; HEURRC-NEXT: v_mov_b32_e32 v9, s1
-; HEURRC-NEXT: v_mov_b32_e32 v10, s2
-; HEURRC-NEXT: v_mov_b32_e32 v11, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -276,26 +258,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -320,51 +296,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v4, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_mov_b32_e32 v8, s0
-; HEURRC-NEXT: v_mov_b32_e32 v9, s1
-; HEURRC-NEXT: v_mov_b32_e32 v10, s2
-; HEURRC-NEXT: v_mov_b32_e32 v11, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
@@ -5455,76 +5419,58 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NEXT: v_mov_b32_e32 v5, s13
-; GCN-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NEXT: v_mov_b32_e32 v7, s15
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v4, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_mov_b32_e32 v8, s0
-; HEURRC-NEXT: v_mov_b32_e32 v9, s1
-; HEURRC-NEXT: v_mov_b32_e32 v10, s2
-; HEURRC-NEXT: v_mov_b32_e32 v11, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -5573,76 +5519,58 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: v_mov_b32_e32 v4, s12
-; GCN-NEXT: v_mov_b32_e32 v5, s13
-; GCN-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NEXT: v_mov_b32_e32 v7, s15
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v4, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_mov_b32_e32 v8, s0
-; HEURRC-NEXT: v_mov_b32_e32 v9, s1
-; HEURRC-NEXT: v_mov_b32_e32 v10, s2
-; HEURRC-NEXT: v_mov_b32_e32 v11, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 9ea8771506aa2..3897a0e028334 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-SDAG-LABEL: s_exp_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
-; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
-; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
-; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
+; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
+; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
+; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
+; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp_v2f32:
@@ -520,42 +520,41 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; SI-SDAG-LABEL: s_exp_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 268e1e25f766f..3928ec2dd76d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_exp10_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000
-; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
-; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
-; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
+; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
+; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
+; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
+; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp10_v2f32:
@@ -522,42 +522,41 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; SI-SDAG-LABEL: s_exp10_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_mul_f32_e32 v2, s5, v0
; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v2
; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4
+; SI-SDAG-NEXT: v_fma_f32 v4, s5, v1, v4
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
-; SI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v5
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v5
; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0
+; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; SI-SDAG-NEXT: v_add_f32_e32 v0, v7, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v5, v6
; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3
; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-SDAG-NEXT: v_mov_b32_e32 v6, 0x7f800000
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v4
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3
+; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index c3f5146168033..dd44a1a35067e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -176,26 +176,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
+; SI-SDAG-NEXT: v_add_f32_e32 v2, s5, v2
+; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0
+; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s6
+; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_exp2_v2f32:
@@ -225,26 +224,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_exp2_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3
+; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 74b6c75ac4948..7c06ae2f39f45 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf
+; SI-SDAG-NEXT: s_mov_b32 s8, 0x3f317217
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
-; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-SDAG-NEXT: s_mov_b32 s7, 0x3f317217
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3
-; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
+; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1
; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5
-; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
@@ -406,51 +405,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-SDAG-LABEL: s_log_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7
-; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
-; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index c4fdac3ac5b0e..24e2fb4c8d9d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -321,39 +321,38 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-SDAG-LABEL: s_log10_v2f32:
; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf
+; SI-SDAG-NEXT: s_mov_b32 s8, 0x3e9a209a
; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0
-; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; SI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
-; SI-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3
+; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0
+; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
+; SI-SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s5, v3
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-SDAG-NEXT: s_mov_b32 s0, s4
-; SI-SDAG-NEXT: s_mov_b32 s1, s5
-; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a
+; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3
-; SI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; SI-SDAG-NEXT: v_fma_f32 v5, v3, s7, -v4
+; SI-SDAG-NEXT: s_cselect_b32 s6, 32, 0
+; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, -v4
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; SI-SDAG-NEXT: v_fma_f32 v5, v3, s8, v5
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; SI-SDAG-NEXT: v_fma_f32 v5, v3, s5, v5
+; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1
; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
; SI-SDAG-NEXT: v_log_f32_e32 v5, v1
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5
-; SI-SDAG-NEXT: v_fma_f32 v3, v5, s7, -v2
-; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, v3
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s8, -v2
+; SI-SDAG-NEXT: v_fma_f32 v3, v5, s5, v3
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s9
; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
@@ -406,51 +405,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_log10_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
-; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7
-; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
-; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
+; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
+; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
+; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 444f37059406a..e24fd1f22bfa6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -221,8 +221,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
@@ -238,11 +236,11 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
; SI-SDAG-NEXT: v_log_f32_e32 v4, v1
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
-; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: s_log2_v2f32:
@@ -285,16 +283,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4
-; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
+; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 32a644ed334cc..ac6dd30283554 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -362,16 +362,15 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x
; SI-LABEL: s_test_imax_sgt_imm_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_max_i32 s0, s3, 9
-; SI-NEXT: s_max_i32 s1, s2, 9
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_max_i32 s5, s5, 9
+; SI-NEXT: s_max_i32 s4, s4, 9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32:
@@ -869,16 +868,15 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x
; SI-LABEL: s_test_umax_ugt_imm_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_max_u32 s0, s3, 23
-; SI-NEXT: s_max_u32 s1, s2, 15
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: s_max_u32 s5, s5, 23
+; SI-NEXT: s_max_u32 s4, s4, 15
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32:
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index c571cfc3648e2..eff0680fe9a31 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1074,118 +1074,118 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
;
; CI-LABEL: s_test_imin_sle_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
+; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_ashr_i32 s0, s2, 16
-; CI-NEXT: s_ashr_i32 s1, s3, 16
+; CI-NEXT: s_ashr_i32 s6, s0, 16
+; CI-NEXT: s_ashr_i32 s7, s1, 16
+; CI-NEXT: s_sext_i32_i16 s0, s0
+; CI-NEXT: s_sext_i32_i16 s1, s1
+; CI-NEXT: s_ashr_i32 s8, s2, 16
+; CI-NEXT: s_ashr_i32 s9, s3, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
-; CI-NEXT: s_ashr_i32 s6, s4, 16
-; CI-NEXT: s_ashr_i32 s7, s5, 16
-; CI-NEXT: s_sext_i32_i16 s4, s4
-; CI-NEXT: s_sext_i32_i16 s5, s5
-; CI-NEXT: s_min_i32 s1, s1, s7
-; CI-NEXT: s_min_i32 s3, s3, s5
-; CI-NEXT: s_min_i32 s0, s0, s6
-; CI-NEXT: s_min_i32 s2, s2, s4
-; CI-NEXT: s_lshl_b32 s1, s1, 16
-; CI-NEXT: s_and_b32 s3, s3, 0xffff
-; CI-NEXT: s_lshl_b32 s0, s0, 16
-; CI-NEXT: s_and_b32 s2, s2, 0xffff
-; CI-NEXT: s_or_b32 s1, s3, s1
-; CI-NEXT: s_or_b32 s0, s2, s0
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CI-NEXT: s_min_i32 s7, s7, s9
+; CI-NEXT: s_min_i32 s1, s1, s3
+; CI-NEXT: s_min_i32 s3, s6, s8
+; CI-NEXT: s_min_i32 s0, s0, s2
+; CI-NEXT: s_lshl_b32 s7, s7, 16
+; CI-NEXT: s_and_b32 s1, s1, 0xffff
+; CI-NEXT: s_lshl_b32 s3, s3, 16
+; CI-NEXT: s_and_b32 s0, s0, 0xffff
+; CI-NEXT: s_or_b32 s1, s1, s7
+; CI-NEXT: s_or_b32 s0, s0, s3
+; CI-NEXT: v_mov_b32_e32 v2, s4
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_ashr_i32 s0, s5, 16
-; VI-NEXT: s_ashr_i32 s1, s3, 16
-; VI-NEXT: s_min_i32 s0, s1, s0
-; VI-NEXT: s_sext_i32_i16 s1, s5
+; VI-NEXT: s_ashr_i32 s6, s3, 16
+; VI-NEXT: s_ashr_i32 s7, s1, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_min_i32 s1, s3, s1
-; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: s_min_i32 s6, s7, s6
+; VI-NEXT: s_min_i32 s1, s1, s3
+; VI-NEXT: s_lshl_b32 s6, s6, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s1, s4, 16
+; VI-NEXT: s_or_b32 s1, s1, s6
; VI-NEXT: s_ashr_i32 s3, s2, 16
-; VI-NEXT: s_min_i32 s1, s3, s1
-; VI-NEXT: s_sext_i32_i16 s3, s4
+; VI-NEXT: s_ashr_i32 s6, s0, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_min_i32 s2, s2, s3
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_or_b32 s1, s2, s1
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_min_i32 s3, s6, s3
+; VI-NEXT: s_min_i32 s0, s0, s2
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_pk_min_i16 v1, s3, v0
-; GFX9-NEXT: v_pk_min_i16 v0, s2, v3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
+; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_min_i16 v1, s3, s5
-; GFX10-NEXT: v_pk_min_i16 v0, s2, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
+; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_i16 v1, s3, s5
-; GFX11-NEXT: v_pk_min_i16 v0, s2, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_pk_min_i16 v1, s1, s3
+; GFX11-NEXT: v_pk_min_i16 v0, s0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_sle_v4i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_min_i16 v1, s3, s7
-; GFX1250-NEXT: v_pk_min_i16 v0, s2, s6
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3
+; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-NEXT: s_endpgm
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1636,92 +1636,92 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
;
; CI-LABEL: s_test_imin_slt_v2i32:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
+; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_min_i32 s1, s1, s3
+; CI-NEXT: s_min_i32 s0, s0, s2
+; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_min_i32 s0, s3, s5
-; CI-NEXT: s_min_i32 s1, s2, s4
-; CI-NEXT: v_mov_b32_e32 v2, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_slt_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_min_i32 s1, s1, s3
+; VI-NEXT: s_min_i32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_min_i32 s0, s3, s5
-; VI-NEXT: s_min_i32 s1, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_slt_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_min_i32 s3, s3, s5
-; GFX9-NEXT: s_min_i32 s2, s2, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_min_i32 s1, s1, s3
+; GFX9-NEXT: s_min_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_slt_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_min_i32 s2, s2, s4
-; GFX10-NEXT: s_min_i32 s3, s3, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_min_i32 s0, s0, s2
+; GFX10-NEXT: s_min_i32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_slt_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_min_i32 s2, s2, s4
-; GFX11-NEXT: s_min_i32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_min_i32 s0, s0, s2
+; GFX11-NEXT: s_min_i32 s1, s1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: s_test_imin_slt_v2i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_min_i32 s2, s2, s6
-; GFX1250-NEXT: s_min_i32 s3, s3, s7
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s3
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_min_i32 s0, s0, s2
+; GFX1250-NEXT: s_min_i32 s1, s1, s3
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX1250-NEXT: s_endpgm
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 6eefafa37648f..1ed024f7aed36 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -76,33 +76,19 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-SDAG-LABEL: fadd_v2_vs:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-SDAG-NEXT: s_endpgm
-;
-; GFX1250-GISEL-LABEL: fadd_v2_vs:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-GISEL-NEXT: s_endpgm
+; GFX1250-LABEL: fadd_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1377,33 +1363,19 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-SDAG-LABEL: fmul_v2_vs:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-SDAG-NEXT: s_endpgm
-;
-; GFX1250-GISEL-LABEL: fmul_v2_vs:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
-; GFX1250-GISEL-NEXT: s_endpgm
+; GFX1250-LABEL: fmul_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -3568,8 +3540,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX900-LABEL: fadd_fadd_fsub:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s3
; GFX900-NEXT: v_add_f32_e32 v0, s1, v0
@@ -3577,14 +3549,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX900-NEXT: v_add_f32_e32 v3, s2, v0
; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1
; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX900-NEXT: s_endpgm
;
; PACKED-SDAG-LABEL: fadd_fadd_fsub:
; PACKED-SDAG: ; %bb.0: ; %bb
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; PACKED-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3
; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0
@@ -3592,7 +3564,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
-; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
;
; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index bfdfce12cecf7..0a1d15bf945f9 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -94,64 +94,62 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; SI-LABEL: rotl_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_sub_i32 s5, 32, s5
-; SI-NEXT: s_sub_i32 s4, 32, s4
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_sub_i32 s2, 32, s2
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX8-LABEL: rotl_v2i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_sub_i32 s1, 32, s5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_sub_i32 s2, 32, s2
+; GFX8-NEXT: s_sub_i32 s3, 32, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
+; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: rotl_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s4, 32, s7
-; GFX10-NEXT: s_sub_i32 s5, 32, s6
-; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4
-; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_sub_i32 s3, 32, s3
+; GFX10-NEXT: s_sub_i32 s2, 32, s2
+; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s4, 32, s7
-; GFX11-NEXT: s_sub_i32 s5, 32, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4
-; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_sub_i32 s2, 32, s2
+; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
+; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
%0 = shl <2 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 938d24481aaf7..403a556688091 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -83,56 +83,54 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; SI-LABEL: rotr_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_alignbit_b32 v1, s7, s7, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_alignbit_b32 v0, s6, s6, v0
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX8-LABEL: rotr_v2i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
+; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: rotr_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s7
-; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotr_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5
-; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
+; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index 401b6f20d3405..f14a5cc19774d 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -56,8 +56,8 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 {
}
; SI-LABEL: {{^}}s_addk_v2i32_k0:
-; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
; SI: s_endpgm
; Note: dummy argument here to prevent combining of descriptor loads for %out and %b
define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index be10302c42854..76f8f484fc763 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -331,80 +331,79 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_sub_i32 s1, 0, s2
-; VI-NEXT: s_lshr_b32 s5, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_i32 s0, 0, s3
-; VI-NEXT: s_lshr_b32 s4, s3, 16
-; VI-NEXT: s_sub_i32 s5, 0, s5
-; VI-NEXT: s_ashr_i32 s6, s2, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_sub_i32 s4, 0, s4
-; VI-NEXT: s_sext_i32_i16 s5, s5
-; VI-NEXT: s_max_i32 s1, s2, s1
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_sext_i32_i16 s2, s3
-; VI-NEXT: s_max_i32 s5, s6, s5
-; VI-NEXT: s_ashr_i32 s6, s3, 16
+; VI-NEXT: s_lshr_b32 s7, s2, 16
+; VI-NEXT: s_sub_i32 s7, 0, s7
+; VI-NEXT: s_sub_i32 s4, 0, s3
+; VI-NEXT: s_lshr_b32 s6, s3, 16
+; VI-NEXT: s_ashr_i32 s8, s2, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: s_sub_i32 s5, 0, s2
+; VI-NEXT: s_sub_i32 s6, 0, s6
+; VI-NEXT: s_max_i32 s7, s8, s7
+; VI-NEXT: s_ashr_i32 s8, s3, 16
; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: s_max_i32 s0, s2, s0
-; VI-NEXT: s_max_i32 s4, s6, s4
-; VI-NEXT: s_add_i32 s0, s0, 2
-; VI-NEXT: s_lshl_b32 s2, s4, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_add_i32 s1, s1, 2
-; VI-NEXT: s_or_b32 s0, s2, s0
-; VI-NEXT: s_lshl_b32 s2, s5, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s1, s2, s1
-; VI-NEXT: s_add_i32 s0, s0, 0x20000
-; VI-NEXT: s_add_i32 s1, s1, 0x20000
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_sext_i32_i16 s3, s3
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_max_i32 s3, s3, s4
+; VI-NEXT: s_max_i32 s6, s8, s6
+; VI-NEXT: s_max_i32 s2, s2, s5
+; VI-NEXT: s_add_i32 s3, s3, 2
+; VI-NEXT: s_lshl_b32 s4, s6, 16
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_or_b32 s3, s4, s3
+; VI-NEXT: s_lshl_b32 s4, s7, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s2, s4, s2
+; VI-NEXT: s_add_i32 s3, s3, 0x20000
+; VI-NEXT: s_add_i32 s2, s2, 0x20000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_abs_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b32 s4, s0
-; CI-NEXT: s_mov_b32 s5, s1
-; CI-NEXT: s_ashr_i32 s0, s3, 16
-; CI-NEXT: s_ashr_i32 s1, s2, 16
-; CI-NEXT: s_lshr_b32 s8, s2, 16
-; CI-NEXT: s_lshr_b32 s9, s3, 16
-; CI-NEXT: s_sext_i32_i16 s10, s3
-; CI-NEXT: s_sext_i32_i16 s11, s2
-; CI-NEXT: s_sub_i32 s3, 0, s3
-; CI-NEXT: s_sub_i32 s2, 0, s2
-; CI-NEXT: s_sext_i32_i16 s3, s3
-; CI-NEXT: s_sext_i32_i16 s2, s2
+; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT: s_ashr_i32 s6, s5, 16
+; CI-NEXT: s_lshr_b32 s9, s5, 16
+; CI-NEXT: s_sext_i32_i16 s10, s5
+; CI-NEXT: s_sub_i32 s5, 0, s5
+; CI-NEXT: s_ashr_i32 s7, s4, 16
+; CI-NEXT: s_lshr_b32 s8, s4, 16
+; CI-NEXT: s_sext_i32_i16 s11, s4
+; CI-NEXT: s_sext_i32_i16 s5, s5
+; CI-NEXT: s_sub_i32 s4, 0, s4
; CI-NEXT: s_sub_i32 s9, 0, s9
-; CI-NEXT: s_sub_i32 s8, 0, s8
+; CI-NEXT: s_sext_i32_i16 s4, s4
; CI-NEXT: s_sext_i32_i16 s9, s9
+; CI-NEXT: s_sub_i32 s8, 0, s8
+; CI-NEXT: s_max_i32 s5, s10, s5
; CI-NEXT: s_sext_i32_i16 s8, s8
-; CI-NEXT: s_max_i32 s2, s11, s2
-; CI-NEXT: s_max_i32 s3, s10, s3
-; CI-NEXT: s_max_i32 s1, s1, s8
-; CI-NEXT: s_max_i32 s0, s0, s9
-; CI-NEXT: s_add_i32 s3, s3, 2
-; CI-NEXT: s_add_i32 s2, s2, 2
-; CI-NEXT: s_lshl_b32 s0, s0, 16
-; CI-NEXT: s_and_b32 s3, s3, 0xffff
-; CI-NEXT: s_lshl_b32 s1, s1, 16
-; CI-NEXT: s_and_b32 s2, s2, 0xffff
-; CI-NEXT: s_or_b32 s0, s0, s3
-; CI-NEXT: s_or_b32 s1, s1, s2
-; CI-NEXT: s_add_i32 s0, s0, 0x20000
-; CI-NEXT: s_add_i32 s1, s1, 0x20000
-; CI-NEXT: v_mov_b32_e32 v0, s1
-; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT: s_max_i32 s6, s6, s9
+; CI-NEXT: s_max_i32 s4, s11, s4
+; CI-NEXT: s_add_i32 s5, s5, 2
+; CI-NEXT: s_max_i32 s7, s7, s8
+; CI-NEXT: s_lshl_b32 s6, s6, 16
+; CI-NEXT: s_and_b32 s5, s5, 0xffff
+; CI-NEXT: s_add_i32 s4, s4, 2
+; CI-NEXT: s_or_b32 s5, s6, s5
+; CI-NEXT: s_lshl_b32 s6, s7, 16
+; CI-NEXT: s_and_b32 s4, s4, 0xffff
+; CI-NEXT: s_or_b32 s4, s6, s4
+; CI-NEXT: s_add_i32 s5, s5, 0x20000
+; CI-NEXT: s_add_i32 s4, s4, 0x20000
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
%z0 = insertelement <4 x i16> poison, i16 0, i16 0
%z1 = insertelement <4 x i16> %z0, i16 0, i16 1
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
index d8f7f8d7fefcc..9b3b52012f327 100644
--- a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
@@ -136,8 +136,7 @@ define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; CHECK-NEXT: s_endpgm
store <2 x i32> %v, ptr addrspace(4) %p
@@ -164,8 +163,7 @@ define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; CHECK-NEXT: s_endpgm
store <2 x float> %v, ptr addrspace(4) %p
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index fc42f476fe7d0..eaab3531824c4 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -164,102 +164,98 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
;
; GFX6-LABEL: test_udivrem_v2:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT: s_sub_i32 s0, 0, s8
-; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT: s_sub_i32 s6, 0, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: s_mul_i32 s4, s4, s8
-; GFX6-NEXT: s_sub_i32 s2, s2, s4
-; GFX6-NEXT: s_sub_i32 s4, s2, s8
-; GFX6-NEXT: s_cmp_ge_u32 s2, s8
-; GFX6-NEXT: s_cselect_b32 s2, s4, s2
-; GFX6-NEXT: s_sub_i32 s4, s2, s8
-; GFX6-NEXT: s_cmp_ge_u32 s2, s8
-; GFX6-NEXT: s_cselect_b32 s2, s4, s2
-; GFX6-NEXT: s_sub_i32 s4, 0, s9
-; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1
-; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: v_readfirstlane_b32 s6, v0
+; GFX6-NEXT: s_mul_i32 s6, s6, s2
+; GFX6-NEXT: s_sub_i32 s0, s0, s6
+; GFX6-NEXT: s_sub_i32 s6, s0, s2
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
+; GFX6-NEXT: s_cselect_b32 s0, s6, s0
+; GFX6-NEXT: s_sub_i32 s6, s0, s2
+; GFX6-NEXT: s_cmp_ge_u32 s0, s2
+; GFX6-NEXT: s_cselect_b32 s0, s6, s0
+; GFX6-NEXT: s_sub_i32 s2, 0, s3
+; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s9
-; GFX6-NEXT: s_sub_i32 s0, s3, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s9
-; GFX6-NEXT: s_cmp_ge_u32 s0, s9
-; GFX6-NEXT: s_cselect_b32 s0, s1, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s9
-; GFX6-NEXT: s_cmp_ge_u32 s0, s9
-; GFX6-NEXT: s_cselect_b32 s0, s1, s0
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s2, s2, s3
+; GFX6-NEXT: s_sub_i32 s1, s1, s2
+; GFX6-NEXT: s_sub_i32 s2, s1, s3
+; GFX6-NEXT: s_cmp_ge_u32 s1, s3
+; GFX6-NEXT: s_cselect_b32 s1, s2, s1
+; GFX6-NEXT: s_sub_i32 s2, s1, s3
+; GFX6-NEXT: s_cmp_ge_u32 s1, s3
+; GFX6-NEXT: s_cselect_b32 s1, s2, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX8-NEXT: s_sub_i32 s0, 0, s6
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX8-NEXT: s_sub_i32 s6, 0, s2
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mul_i32 s4, s4, s6
-; GFX8-NEXT: s_sub_i32 s2, s2, s4
-; GFX8-NEXT: s_sub_i32 s4, s2, s6
-; GFX8-NEXT: s_cmp_ge_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s4, s2
-; GFX8-NEXT: s_sub_i32 s4, s2, s6
-; GFX8-NEXT: s_cmp_ge_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s4, s2
-; GFX8-NEXT: s_sub_i32 s4, 0, s7
-; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_readfirstlane_b32 s6, v0
+; GFX8-NEXT: s_mul_i32 s6, s6, s2
+; GFX8-NEXT: s_sub_i32 s0, s0, s6
+; GFX8-NEXT: s_sub_i32 s6, s0, s2
+; GFX8-NEXT: s_cmp_ge_u32 s0, s2
+; GFX8-NEXT: s_cselect_b32 s0, s6, s0
+; GFX8-NEXT: s_sub_i32 s6, s0, s2
+; GFX8-NEXT: s_cmp_ge_u32 s0, s2
+; GFX8-NEXT: s_cselect_b32 s0, s6, s0
+; GFX8-NEXT: s_sub_i32 s2, 0, s3
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT: v_mul_hi_u32 v2, s3, v0
+; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: s_mul_i32 s0, s0, s7
-; GFX8-NEXT: s_sub_i32 s0, s3, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s7
-; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mul_i32 s0, s0, s3
+; GFX8-NEXT: s_sub_i32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s3
+; GFX8-NEXT: s_cmp_ge_u32 s0, s3
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s7
-; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_sub_i32 s1, s0, s3
+; GFX8-NEXT: s_cmp_ge_u32 s0, s3
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result0 = udiv <2 x i32> %x, %y
store <2 x i32> %result0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index bd311a1054a41..983acfc2c0699 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -340,8 +340,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
diff --git a/llvm/test/Transforms/InstCombine/copy-access-metadata.ll b/llvm/test/Transforms/InstCombine/copy-access-metadata.ll
deleted file mode 100644
index c687f3796edcb..0000000000000
--- a/llvm/test/Transforms/InstCombine/copy-access-metadata.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -passes=instcombine %s | FileCheck %s
-
- at test.data = private unnamed_addr constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4
- at test.ptrdata = private unnamed_addr constant [8 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null], align 8
-
-; Verify that InstCombine copies range metadata when cloning a load as part of
-; replacing an alloca initialized via memcpy from a constant. OK
-define i32 @copy_range_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_range_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !range [[RNG0:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !range !0
- ret i32 %l
-}
-
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
-
-!0 = !{i32 0, i32 100}
-
-; Verify TBAA metadata on a cloned load is preserved. OK
-define i32 @copy_tbaa_metadata_after_memcpy(i64 %x, ptr %sink) {
-; CHECK-LABEL: define i32 @copy_tbaa_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]], ptr [[SINK:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[SCALAR_TYPE_TBAA1:![0-9]+]]
-; CHECK-NEXT: store i32 [[L]], ptr [[SINK]], align 4
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !tbaa !1
- store i32 %l, ptr %sink, align 4
- ret i32 %l
-}
-
-!1 = !{!2, !2, i64 0}
-!2 = !{!"scalar type", !3}
-!3 = !{!"root"}
-
-; Verify dereferenceable_or_null metadata on a cloned load is preserved
-; when the loaded value type is a pointer. OK
-define ptr @copy_deref_or_null_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define ptr @copy_deref_or_null_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: ret ptr null
-;
-entry:
- %data = alloca [8 x ptr], align 8
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
- %arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
- %l = load ptr, ptr %arrayidx, align 8, !dereferenceable_or_null !4
- ret ptr %l
-}
-
-!4 = !{i64 8}
-
-; Verify nonnull metadata on a cloned load is preserved
-; when the loaded value type is a pointer. OK
-define ptr @copy_nonnull_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define ptr @copy_nonnull_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: ret ptr null
-;
-entry:
- %data = alloca [8 x ptr], align 8
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
- %arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
- %l = load ptr, ptr %arrayidx, align 8, !nonnull !5
- ret ptr %l
-}
-
-!5 = !{}
-
-; Verify invariant.load metadata on a cloned load is preserved. OK
-define i32 @copy_invariant_load_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_invariant_load_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META4:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !invariant.load !5
- ret i32 %l
-}
-
-; Verify alias.scope and noalias metadata on a cloned load are preserved. OK
-define i32 @copy_aliasscope_noalias_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_aliasscope_noalias_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !alias.scope !6, !noalias !6
- ret i32 %l
-}
-
-; Verify nontemporal metadata on a cloned load is preserved.OK
-define i32 @copy_nontemporal_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_nontemporal_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !nontemporal [[META8:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !nontemporal !9
- ret i32 %l
-}
-
-; Verify access group metadata on a cloned load is preserved. OK
-define i32 @copy_access_group_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_access_group_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !llvm.access.group !10
- ret i32 %l
-}
-
-; Verify noalias.addrspace metadata on a cloned load is preserved.
-define i32 @copy_noalias_addrspace_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_noalias_addrspace_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !noalias.addrspace [[META10:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !noalias.addrspace !12
- ret i32 %l
-}
-
-; Verify llvm.mem.parallel_loop_access metadata on a cloned load is preserved. OK
-define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(i64 %x) {
-; CHECK-LABEL: define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(
-; CHECK-SAME: i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access [[META11:![0-9]+]]
-; CHECK-NEXT: ret i32 [[L]]
-;
-entry:
- %data = alloca [8 x i32], align 4
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
- %arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
- %l = load i32, ptr %arrayidx, align 4, !llvm.mem.parallel_loop_access !13
- ret i32 %l
-}
-
-!6 = !{!7}
-!7 = distinct !{!7, !8}
-!8 = distinct !{!8}
-!9 = !{i32 1}
-!10 = distinct !{}
-!12 = !{i32 5, i32 6}
-!13 = !{!14}
-!14 = distinct !{}
-
-
-
-;.
-; CHECK: [[RNG0]] = !{i32 0, i32 100}
-; CHECK: [[SCALAR_TYPE_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
-; CHECK: [[META2]] = !{!"scalar type", [[META3:![0-9]+]]}
-; CHECK: [[META3]] = !{!"root"}
-; CHECK: [[META4]] = !{}
-; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-; CHECK: [[META8]] = !{i32 1}
-; CHECK: [[ACC_GRP9]] = distinct !{}
-; CHECK: [[META10]] = !{i32 5, i32 6}
-; CHECK: [[META11]] = !{[[META12:![0-9]+]]}
-; CHECK: [[META12]] = distinct !{}
-;.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll
deleted file mode 100644
index 7cb74c3cc2d2c..0000000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/copy-metadata-load-store.ll
+++ /dev/null
@@ -1,159 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
-
-; We expect the merged vector load to retain nontemporal and tbaa, and normalization to handle
-; other load-only metadata.
-define void @lsv_copy_load_metadata(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[CHAR_TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-NEXT: [[LD1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[LD1_MUT_BC:%.*]] = bitcast i32 [[LD1_MUT2]] to <2 x i16>
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- %ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- ret void
-}
-
-; Check that metadata on stores is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on stores).
-define void @lsv_copy_store_metadata(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_metadata(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 7, ptr %p, align 4, !nontemporal !5
- store <2 x i16> <i16 4, i16 5>, ptr %p1, align 4, !nontemporal !5
- ret void
-}
-
-; Copy alias.scope and noalias metadata on vectorized stores.
-define void @lsv_copy_store_alias_metadata(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_alias_metadata(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 1, i32 bitcast (<2 x i16> <i16 2, i16 3> to i32)>, ptr [[P]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 1, ptr %p, align 4, !alias.scope !11, !noalias !11
- store <2 x i16> <i16 2, i16 3>, ptr %p1, align 4, !alias.scope !11, !noalias !11
- ret void
-}
-
-; Copy access group metadata on vectorized stores.
-define void @lsv_copy_store_access_group(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_access_group(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 9, i32 bitcast (<2 x i16> <i16 8, i16 7> to i32)>, ptr [[P]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 9, ptr %p, align 4, !llvm.access.group !14
- store <2 x i16> <i16 8, i16 7>, ptr %p1, align 4, !llvm.access.group !14
- ret void
-}
-
-; Copy noundef metadata on vectorized stores.
-define void @lsv_copy_store_noundef(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_noundef(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 42, i32 bitcast (<2 x i16> <i16 6, i16 5> to i32)>, ptr [[P]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 42, ptr %p, align 4, !noundef !15
- store <2 x i16> <i16 6, i16 5>, ptr %p1, align 4, !noundef !15
- ret void
-}
-
-; Copy noalias.addrspace metadata on vectorized stores.
-define void @lsv_copy_store_noalias_addrspace(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_noalias_addrspace(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 11, i32 bitcast (<2 x i16> <i16 10, i16 9> to i32)>, ptr [[P]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 11, ptr %p, align 4, !noalias.addrspace !16
- store <2 x i16> <i16 10, i16 9>, ptr %p1, align 4, !noalias.addrspace !16
- ret void
-}
-
-; Copy llvm.mem.parallel_loop_access metadata on vectorized stores.
-define void @lsv_copy_store_mem_parallel_loop_access(ptr %p) {
-; CHECK-LABEL: define void @lsv_copy_store_mem_parallel_loop_access(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: store <2 x i32> <i32 13, i32 bitcast (<2 x i16> <i16 12, i16 11> to i32)>, ptr [[P]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 13, ptr %p, align 4, !llvm.mem.parallel_loop_access !17
- store <2 x i16> <i16 12, i16 11>, ptr %p1, align 4, !llvm.mem.parallel_loop_access !17
- ret void
-}
-
-; Normalized type is not a pointer in the following test, avoid copying
-; dereferenceable_or_null metadata.
-define void @lsv_no_copy_deref_or_null(ptr %p) {
-; CHECK-LABEL: define void @lsv_no_copy_deref_or_null(
-; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P]], align 8
-; CHECK-NEXT: [[LD0_MUT1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
-; CHECK-NEXT: [[LD0_MUT_BC:%.*]] = inttoptr i64 [[LD0_MUT1]] to ptr
-; CHECK-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %ld0 = load ptr, ptr %p, align 4, !dereferenceable_or_null !7
- %ld1 = load i64, ptr %p1, align 4
- ret void
-}
-
-!0 = !{!3, !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{i32 1}
-!6 = !{}
-!7 = !{i64 8}
-!8 = !{i64 1, i64 256}
-!11 = !{!12}
-!12 = distinct !{!12, !13}
-!13 = distinct !{!13}
-!14 = distinct !{}
-!15 = !{}
-!16 = !{i32 5, i32 6}
-!17 = !{!18}
-!18 = distinct !{}
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-;.
-; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
-; CHECK: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
-; CHECK: [[META2]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[META3]] = !{}
-; CHECK: [[META4]] = !{i32 1}
-; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-;.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
index 64e8b1afb8c80..c53f4b6d7ff2b 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -1,273 +1,57 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
-define void @no_merge_i16_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i16_half(
+define void @merge_i32_v2i16_f32_v4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_i32_v2i16_f32_v4i8(
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i16, ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NEXT: [[LOAD_1:%.*]] = load half, ptr addrspace(1) [[GEP_1]], align 2
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i16 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 2
-; CHECK-NEXT: store half [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 2
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i16, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i16, ptr addrspace(1) %ptr1
- %load.1 = load half, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i16, ptr addrspace(2) %ptr2, i64 1
- store i16 %load.0, ptr addrspace(2) %ptr2
- store half %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @no_merge_i16_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i16_float(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i16, ptr addrspace(1) [[PTR1]], align 2
-; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i16, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i16 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 2
-; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i16, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i16, ptr addrspace(1) %ptr1
- %load.1 = load float, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i16, ptr addrspace(2) %ptr2, i64 1
- store i16 %load.0, ptr addrspace(2) %ptr2
- store float %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i32_v2i16(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i32_v2i16(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i32 [[LOAD_1_MUT2]] to <2 x i16>
-; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i16> [[LOAD_1_MUT_BC]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD_1_BC]], i32 1
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[PTR2]], align 4
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load <2 x i16>, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
- store i32 %load.0, ptr addrspace(2) %ptr2
- store <2 x i16> %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @no_merge_i32_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i32_ptr(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[LOAD_1:%.*]] = load ptr, ptr addrspace(1) [[GEP_1]], align 8
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
-; CHECK-NEXT: store ptr [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load ptr, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
- store i32 %load.0, ptr addrspace(2) %ptr2
- store ptr %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @no_merge_i32_half(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i32_half(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[LOAD_1:%.*]] = load half, ptr addrspace(1) [[GEP_1]], align 2
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
-; CHECK-NEXT: store half [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 2
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load half, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
- store i32 %load.0, ptr addrspace(2) %ptr2
- store half %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i32_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i32_float(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[LOAD_12]] to float
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_01]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP2]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
-; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(2) [[PTR2]], align 4
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load float, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
- store i32 %load.0, ptr addrspace(2) %ptr2
- store float %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @no_merge_i32_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i32_double(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[LOAD_1:%.*]] = load double, ptr addrspace(1) [[GEP_1]], align 8
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i32 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 4
-; CHECK-NEXT: store double [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 8
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load double, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 1
- store i32 %load.0, ptr addrspace(2) %ptr2
- store double %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i64_ptr(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i64_ptr(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
-; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[LOAD_12]] to ptr
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
-; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i64, ptr addrspace(1) %ptr1
- %load.1 = load ptr, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
- store i64 %load.0, ptr addrspace(2) %ptr2
- store ptr %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @no_merge_i64_float(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @no_merge_i64_float(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i64, ptr addrspace(1) [[PTR1]], align 8
-; CHECK-NEXT: [[LOAD_1:%.*]] = load float, ptr addrspace(1) [[GEP_1]], align 4
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i64 [[LOAD_0]], ptr addrspace(2) [[PTR2]], align 8
-; CHECK-NEXT: store float [[LOAD_1]], ptr addrspace(2) [[STORE_GEP_1]], align 4
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i64, ptr addrspace(1) %ptr1
- %load.1 = load float, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
- store i64 %load.0, ptr addrspace(2) %ptr2
- store float %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i64_double(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i64_double(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
-; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD_12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD_12]] to double
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
-; CHECK-NEXT: store double [[TMP2]], ptr addrspace(2) [[STORE_GEP_1]], align 8
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i64, ptr addrspace(1) %ptr1
- %load.1 = load double, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
- store i64 %load.0, ptr addrspace(2) %ptr2
- store double %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i64_v2i32(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i64_v2i32(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR1]], align 8
-; CHECK-NEXT: [[LOAD_01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD_1_MUT2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[LOAD_1_MUT_BC:%.*]] = bitcast i64 [[LOAD_1_MUT2]] to <2 x i32>
-; CHECK-NEXT: [[STORE_GEP_1:%.*]] = getelementptr inbounds i64, ptr addrspace(2) [[PTR2]], i64 1
-; CHECK-NEXT: store i64 [[LOAD_01]], ptr addrspace(2) [[PTR2]], align 8
-; CHECK-NEXT: [[LOAD_1_BC:%.*]] = bitcast <2 x i32> [[LOAD_1_MUT_BC]] to i64
-; CHECK-NEXT: store i64 [[LOAD_1_BC]], ptr addrspace(2) [[STORE_GEP_1]], align 8
-; CHECK-NEXT: ret void
-;
- %gep.1 = getelementptr inbounds i64, ptr addrspace(1) %ptr1, i64 1
- %load.0 = load i64, ptr addrspace(1) %ptr1
- %load.1 = load <2 x i32>, ptr addrspace(1) %gep.1
- %store.gep.1 = getelementptr inbounds i64, ptr addrspace(2) %ptr2, i64 1
- store i64 %load.0, ptr addrspace(2) %ptr2
- store <2 x i32> %load.1, ptr addrspace(2) %store.gep.1
- ret void
-}
-
-define void @merge_i32_v2i16_v4i8(ptr addrspace(1) %ptr1) {
-; CHECK-LABEL: define void @merge_i32_v2i16_v4i8(
-; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]]) {
-; CHECK-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT: [[LOAD2_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD4_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[LOAD2_MUT_BC:%.*]] = bitcast i32 [[LOAD2_MUT1]] to <2 x i16>
-; CHECK-NEXT: [[LOAD4_MUT_BC:%.*]] = bitcast i32 [[LOAD4_MUT2]] to <4 x i8>
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 3
-; CHECK-NEXT: [[LOAD_3:%.*]] = load float, ptr addrspace(1) [[GEP_3]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
+; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT: store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT: store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
+; CHECK-NEXT: store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
+; CHECK-NEXT: [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
+; CHECK-NEXT: store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
; CHECK-NEXT: ret void
;
- %load.0 = load i32, ptr addrspace(1) %ptr1, align 4
- %gep.1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 1
- %load.1 = load <2 x i16>, ptr addrspace(1) %gep.1, align 4
- %gep.2 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 2
- %load.2 = load <4 x i8>, ptr addrspace(1) %gep.2, align 4
- %gep.3 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 3
- %load.3 = load float, ptr addrspace(1) %gep.3, align 4
+ %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
+ %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+ %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
+ %load3 = load float, ptr addrspace(1) %gep3, align 4
+ %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
+ %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+ %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+ store i32 %load1, ptr addrspace(2) %store.gep1, align 4
+ %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
+ store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
+ %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
+ store float %load3, ptr addrspace(2) %store.gep3, align 4
+ %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
+ store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
ret void
}
-define void @merge_float_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_float_v2f16_type(
+define void @merge_f32_v2f16_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_f32_v2f16_type(
; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[LOAD1_MUT1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD2_MUT2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT: [[LOAD1_TOORIG:%.*]] = bitcast i32 [[LOAD1_MUT1]] to float
-; CHECK-NEXT: [[LOAD2_TOORIG:%.*]] = bitcast i32 [[LOAD2_MUT2]] to <2 x half>
+; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
-; CHECK-NEXT: [[LOAD1_BC:%.*]] = bitcast float [[LOAD1_TOORIG]] to i32
-; CHECK-NEXT: [[LOAD2_BC:%.*]] = bitcast <2 x half> [[LOAD2_TOORIG]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD1_BC]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOAD2_BC]], i32 1
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT: store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT: store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
; CHECK-NEXT: ret void
;
%gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
@@ -304,3 +88,27 @@ define void @merge_v2f16_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %p
store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4
ret void
}
+
+define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @no_merge_mixed_ptr_addrspaces(
+; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT: store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT: [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT: store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %ptr1, i64 0
+ %load1 = load ptr addrspace(1), ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) %ptr1, i64 1
+ %load2 = load ptr addrspace(2), ptr addrspace(1) %gep2, align 4
+ %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+ store ptr addrspace(1) %load1, ptr addrspace(2) %store.gep1, align 4
+ %store.gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) %ptr2, i64 1
+ store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index e6f2be25030c5..d6b51039d5b44 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -130,302 +130,24 @@ entry:
ret void
}
+; Ideally this would be merged
define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4
-; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-NEXT: [[LD_1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[LD_1_TOORIG:%.*]] = bitcast i32 [[LD_1_MUT2]] to <2 x i16>
-; CHECK-NEXT: ret void
-;
-entry:
- %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
-
- %ld.0 = load i32, ptr addrspace(1) %a
- %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
-
- ret void
-}
-
-define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 {
-; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8(
-; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i8>, ptr addrspace(1) [[A_1]], align 2
+; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
; CHECK-NEXT: ret void
;
entry:
%a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
%ld.0 = load i32, ptr addrspace(1) %a
- %ld.1 = load <2 x i8>, ptr addrspace(1) %a.1
-
- ret void
-}
-
-define void @test_normalize_loads(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_loads(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-RELAXED-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64
-; CHECK-OOB-RELAXED-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32
-; CHECK-OOB-RELAXED-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64
-; CHECK-OOB-RELAXED-NEXT: [[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]]
-; CHECK-OOB-RELAXED-NEXT: store i64 [[ADD]], ptr null, align 8
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @test_normalize_loads(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-STRICT-NEXT: [[L0_EXT:%.*]] = zext i32 [[L01]] to i64
-; CHECK-OOB-STRICT-NEXT: [[L1_CAST:%.*]] = bitcast <2 x i16> [[L1_MUT_BC]] to i32
-; CHECK-OOB-STRICT-NEXT: [[L1_EXT:%.*]] = zext i32 [[L1_CAST]] to i64
-; CHECK-OOB-STRICT-NEXT: [[ADD:%.*]] = add i64 [[L0_EXT]], [[L1_EXT]]
-; CHECK-OOB-STRICT-NEXT: store i64 [[ADD]], ptr null, align 8
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %l0 = load i32, ptr %p
- %l1 = load <2 x i16>, ptr %p1
- %l0_ext = zext i32 %l0 to i64
- %l1_cast = bitcast <2 x i16> %l1 to i32
- %l1_ext = zext i32 %l1_cast to i64
- %add = add i64 %l0_ext, %l1_ext
- store i64 %add, ptr null
- ret void
-}
-
-define void @test_normalize_stores(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @test_normalize_stores(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 123, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @test_normalize_stores(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 123, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 123, ptr %p
- store <2 x i16> <i16 4, i16 5>, ptr %p1
- ret void
-}
-
-; TODO: Fix the below test
-; Check that metadata on loads is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on loads).
-define void @lsv_copy_load_metadata(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-OOB-RELAXED-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-RELAXED-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_load_metadata(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]], !invariant.load [[META3:![0-9]+]], !nontemporal [[META4:![0-9]+]]
-; CHECK-OOB-STRICT-NEXT: [[L01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-OOB-STRICT-NEXT: [[L1_MUT_BC:%.*]] = bitcast i32 [[L1_MUT2]] to <2 x i16>
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- %ld0 = load i32, ptr %p, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- %ld1 = load <2 x i16>, ptr %p1, align 4, !tbaa !0, !nontemporal !5, !invariant.load !6
- ret void
-}
+ %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1
-; Check that metadata on stores is preserved when LSV normalizes mixed-typed
-; chains (exercises copyMetadataForAccess on stores).
-define void @lsv_copy_store_metadata(ptr %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @lsv_copy_store_metadata(
-; CHECK-OOB-RELAXED-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @lsv_copy_store_metadata(
-; CHECK-OOB-STRICT-SAME: ptr [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 7, i32 bitcast (<2 x i16> <i16 4, i16 5> to i32)>, ptr [[P]], align 4, !nontemporal [[META4]]
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %p1 = getelementptr i32, ptr %p, i64 1
- store i32 7, ptr %p, align 4, !nontemporal !5
- store <2 x i16> <i16 4, i16 5>, ptr %p1, align 4, !nontemporal !5
ret void
}
-!0 = !{!3, !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{i32 1}
-!6 = !{}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-
-
-; Non power-of-two combined span (12 bytes) must not merge chains.
-define void @no_merge_non_pot_span(ptr addrspace(1) %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_non_pot_span(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
-; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_non_pot_span(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
-; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load float, ptr addrspace(1) [[P8]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %l0 = load i32, ptr addrspace(1) %p, align 4
- %p8 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
- %l1 = load float, ptr addrspace(1) %p8, align 4
- ret void
-}
-
-define void @no_merge_diff_ptrop(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_diff_ptrop(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_diff_ptrop(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: [[LOAD_1:%.*]] = load i32, ptr addrspace(2) [[PTR2]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
- %load.0 = load i32, ptr addrspace(1) %ptr1
- %load.1 = load i32, ptr addrspace(2) %ptr2
- ret void
-}
-
-define void @no_merge_load_store(ptr addrspace(1) %ptr1) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_load_store(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_load_store(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[LOAD_0:%.*]] = load i32, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: store i32 111, ptr addrspace(1) [[PTR1]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
- %load.0 = load i32, ptr addrspace(1) %ptr1
- store i32 111, ptr addrspace(1) %ptr1
- ret void
-}
-
-; Stores in this test should not be vectorized as the total byte span
-; from the end of %gep.a to the end of %gep.b is not a power of 2. This
-; is a necessary condition for splitChainByAlignment.
-define void @check_contiguity_of_base_ptrs(ptr addrspace(1) %ptr) {
-; CHECK-OOB-RELAXED-LABEL: define void @check_contiguity_of_base_ptrs(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
-; CHECK-OOB-RELAXED-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
-; CHECK-OOB-RELAXED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
-; CHECK-OOB-RELAXED-NEXT: store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) [[GEP_B]], align 4
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @check_contiguity_of_base_ptrs(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: store i32 274, ptr addrspace(1) [[PTR]], align 4
-; CHECK-OOB-STRICT-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 4
-; CHECK-OOB-STRICT-NEXT: store i64 3610770474484254748, ptr addrspace(1) [[GEP_A]], align 8
-; CHECK-OOB-STRICT-NEXT: [[GEP_B:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[PTR]], i64 12
-; CHECK-OOB-STRICT-NEXT: store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) [[GEP_B]], align 4
-; CHECK-OOB-STRICT-NEXT: ret void
-;
- store i32 274, ptr addrspace(1) %ptr, align 4
- %gep.a = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 4
- store i64 3610770474484254748, ptr addrspace(1) %gep.a, align 8
- %gep.b = getelementptr inbounds nuw i8, ptr addrspace(1) %ptr, i64 12
- store <2 x i32> <i32 1819043144, i32 1867980911>, ptr addrspace(1) %gep.b, align 4
- ret void
-}
-
-; Offset is unknown in the following test, LSV should fail to vectorize.
-define amdgpu_kernel void @assert_computeLeaderDelta(ptr addrspace(1) %a, i64 %idx) {
-; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
-; CHECK-OOB-RELAXED-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @assert_computeLeaderDelta(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[A:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[LD0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[A]], i64 [[IDX]]
-; CHECK-OOB-STRICT-NEXT: [[LD1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %ld0 = load i32, ptr addrspace(1) %a, align 4
- %p1 = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %idx
- %ld1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
- ret void
-}
-
-
-; Overlapping ranges after rebasing should prevent merging across chains.
-define void @no_merge_overlap_after_rebase(ptr addrspace(1) %p) {
-; CHECK-OOB-RELAXED-LABEL: define void @no_merge_overlap_after_rebase(
-; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[P:%.*]]) #[[ATTR1]] {
-; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-RELAXED-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-RELAXED-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
-; CHECK-OOB-RELAXED-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-RELAXED-NEXT: ret void
-;
-; CHECK-OOB-STRICT-LABEL: define void @no_merge_overlap_after_rebase(
-; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[P:%.*]]) {
-; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
-; CHECK-OOB-STRICT-NEXT: [[L0:%.*]] = load i32, ptr addrspace(1) [[P]], align 4
-; CHECK-OOB-STRICT-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 2
-; CHECK-OOB-STRICT-NEXT: [[L1:%.*]] = load <2 x i16>, ptr addrspace(1) [[P1]], align 2
-; CHECK-OOB-STRICT-NEXT: ret void
-;
-entry:
- %l0 = load i32, ptr addrspace(1) %p, align 4
- %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 2
- %l1 = load <2 x i16>, ptr addrspace(1) %p1, align 2
- ret void
-}