[llvm] 3e53c97 - Revert "[AMDGPU] Add IR LiveReg type-based optimization" (#97138)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 23:18:30 PDT 2024
Author: Vitaly Buka
Date: 2024-06-28T23:18:26-07:00
New Revision: 3e53c97d33210db68188e731e93ee48dbaeeae32
URL: https://github.com/llvm/llvm-project/commit/3e53c97d33210db68188e731e93ee48dbaeeae32
DIFF: https://github.com/llvm/llvm-project/commit/3e53c97d33210db68188e731e93ee48dbaeeae32.diff
LOG: Revert "[AMDGPU] Add IR LiveReg type-based optimization" (#97138)
Part of #66838.
https://lab.llvm.org/buildbot/#/builders/52/builds/404
https://lab.llvm.org/buildbot/#/builders/55/builds/358
https://lab.llvm.org/buildbot/#/builders/164/builds/518
This reverts commit ded956440739ae326a99cbaef18ce4362e972679.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
llvm/test/CodeGen/AMDGPU/extract-subvector.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
Removed:
llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7623b73d6dd5f..69fdeaebe0a01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,73 +81,6 @@ class AMDGPULateCodeGenPrepare
bool visitLoadInst(LoadInst &LI);
};
-using ValueToValueMap = DenseMap<const Value *, Value *>;
-
-class LiveRegOptimizer {
-private:
- Module *Mod = nullptr;
- const DataLayout *DL = nullptr;
- const GCNSubtarget *ST;
- /// The scalar type to convert to
- Type *ConvertToScalar;
- /// The set of visited Instructions
- SmallPtrSet<Instruction *, 4> Visited;
- /// The set of Instructions to be deleted
- SmallPtrSet<Instruction *, 4> DeadInstrs;
- /// Map of Value -> Converted Value
- ValueToValueMap ValMap;
- /// Map containing conversions from Optimal Type -> Original Type per BB.
- DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
-
-public:
- /// Calculate and return the type to convert to, given a problematic \p
- /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
- Type *calculateConvertType(Type *OriginalType);
- /// Convert the virtual register defined by \p V to the compatible vector of
- /// legal type
- Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
- /// Convert the virtual register defined by \p V back to the original type \p
- /// ConvertType, stripping away the MSBs in cases where there was an imperfect
- /// fit (e.g. v2i32 -> v7i8)
- Value *convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InstPt,
- BasicBlock *InsertBlock);
- /// Check for problematic PHI nodes or cross-bb values based on the value
- /// defined by \p I, and coerce to legal types if necessary. For problematic
- /// PHI node, we coerce all incoming values in a single invocation.
- bool optimizeLiveType(Instruction *I);
-
- /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
- void removeDeadInstrs();
-
- // Whether or not the type should be replaced to avoid inefficient
- // legalization code
- bool shouldReplace(Type *ITy) {
- FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
- if (!VTy)
- return false;
-
- auto TLI = ST->getTargetLowering();
-
- Type *EltTy = VTy->getElementType();
- // If the element size is not less than the convert to scalar size, then we
- // can't do any bit packing
- if (!EltTy->isIntegerTy() ||
- EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
- return false;
-
- // Only coerce illegal types
- TargetLoweringBase::LegalizeKind LK =
- TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
- return LK.first != TargetLoweringBase::TypeLegal;
- }
-
- LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
- DL = &Mod->getDataLayout();
- ConvertToScalar = Type::getInt32Ty(Mod->getContext());
- }
-};
-
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -169,238 +102,14 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- // "Optimize" the virtual regs that cross basic block boundaries. When
- // building the SelectionDAG, vectors of illegal types that cross basic blocks
- // will be scalarized and widened, with each scalar living in its
- // own register. To work around this, this optimization converts the
- // vectors to equivalent vectors of legal type (which are converted back
- // before uses in subsequent blocks), to pack the bits into fewer physical
- // registers (used in CopyToReg/CopyFromReg pairs).
- LiveRegOptimizer LRO(Mod, &ST);
-
bool Changed = false;
-
for (auto &BB : F)
- for (Instruction &I : make_early_inc_range(BB)) {
+ for (Instruction &I : llvm::make_early_inc_range(BB))
Changed |= visit(I);
- Changed |= LRO.optimizeLiveType(&I);
- }
- LRO.removeDeadInstrs();
return Changed;
}
-Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
- assert(OriginalType->getScalarSizeInBits() <=
- ConvertToScalar->getScalarSizeInBits());
-
- FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
- unsigned ConvertEltCount =
- (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
- if (OriginalSize <= ConvertScalarSize)
- return IntegerType::get(Mod->getContext(), ConvertScalarSize);
-
- return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
- ConvertEltCount, false);
-}
-
-Value *LiveRegOptimizer::convertToOptType(Instruction *V,
- BasicBlock::iterator &InsertPt) {
- FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
- Type *NewTy = calculateConvertType(V->getType());
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
-
- IRBuilder<> Builder(V->getParent(), InsertPt);
- // If there is a bitsize match, we can fit the old vector into a new vector of
- // desired type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, we must use a wider vector.
- assert(NewSize > OriginalSize);
- uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
-
- SmallVector<int, 8> ShuffleMask;
- uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
- for (unsigned I = 0; I < OriginalElementCount; I++)
- ShuffleMask.push_back(I);
-
- for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
- ShuffleMask.push_back(OriginalElementCount);
-
- Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
- return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
-}
-
-Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InsertPt,
- BasicBlock *InsertBB) {
- FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
- TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
-
- IRBuilder<> Builder(InsertBB, InsertPt);
- // If there is a bitsize match, we simply convert back to the original type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, then we must have used a wider value to
- // hold the bits.
- assert(OriginalSize > NewSize);
- // For wide scalars, we can just truncate the value.
- if (!V->getType()->isVectorTy()) {
- Instruction *Trunc = cast<Instruction>(
- Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
- return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
- }
-
- // For wider vectors, we must strip the MSBs to convert back to the original
- // type.
- VectorType *ExpandedVT = VectorType::get(
- Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
- (OriginalSize / NewVTy->getScalarSizeInBits()), false);
- Instruction *Converted =
- cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
-
- unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
- SmallVector<int, 8> ShuffleMask(NarrowElementCount);
- std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
-
- return Builder.CreateShuffleVector(Converted, ShuffleMask);
-}
-
-bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
- SmallVector<Instruction *, 4> Worklist;
- SmallPtrSet<PHINode *, 4> PhiNodes;
- SmallPtrSet<Instruction *, 4> Defs;
- SmallPtrSet<Instruction *, 4> Uses;
-
- Worklist.push_back(cast<Instruction>(I));
- while (!Worklist.empty()) {
- Instruction *II = Worklist.pop_back_val();
-
- if (!Visited.insert(II).second)
- continue;
-
- if (!shouldReplace(II->getType()))
- continue;
-
- if (PHINode *Phi = dyn_cast<PHINode>(II)) {
- PhiNodes.insert(Phi);
- // Collect all the incoming values of problematic PHI nodes.
- for (Value *V : Phi->incoming_values()) {
- // Repeat the collection process for newly found PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *IncInst = dyn_cast<Instruction>(V);
- // Other incoming value types (e.g. vector literals) are unhandled
- if (!IncInst && !isa<ConstantAggregateZero>(V))
- return false;
-
- // Collect all other incoming values for coercion.
- if (IncInst)
- Defs.insert(IncInst);
- }
- }
-
- // Collect all relevant uses.
- for (User *V : II->users()) {
- // Repeat the collection process for problematic PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *UseInst = cast<Instruction>(V);
- // Collect all uses of PHINodes and any use that crosses BB boundaries.
- if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
- Uses.insert(UseInst);
- if (!Defs.count(II) && !isa<PHINode>(II)) {
- Defs.insert(II);
- }
- }
- }
- }
-
- // Coerce and track the defs.
- for (Instruction *D : Defs) {
- if (!ValMap.contains(D)) {
- BasicBlock::iterator InsertPt = std::next(D->getIterator());
- Value *ConvertVal = convertToOptType(D, InsertPt);
- assert(ConvertVal);
- ValMap[D] = ConvertVal;
- }
- }
-
- // Construct new-typed PHI nodes.
- for (PHINode *Phi : PhiNodes) {
- ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
- Phi->getNumIncomingValues(),
- Phi->getName() + ".tc", Phi->getIterator());
- }
-
- // Connect all the PHI nodes with their new incoming values.
- for (PHINode *Phi : PhiNodes) {
- PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
- bool MissingIncVal = false;
- for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
- Value *IncVal = Phi->getIncomingValue(I);
- if (isa<ConstantAggregateZero>(IncVal)) {
- Type *NewType = calculateConvertType(Phi->getType());
- NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
- Phi->getIncomingBlock(I));
- } else if (ValMap.contains(IncVal))
- NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
- else
- MissingIncVal = true;
- }
- DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
- }
- // Coerce back to the original type and replace the uses.
- for (Instruction *U : Uses) {
- // Replace all converted operands for a use.
- for (auto [OpIdx, Op] : enumerate(U->operands())) {
- if (ValMap.contains(Op)) {
- Value *NewVal = nullptr;
- if (BBUseValMap.contains(U->getParent()) &&
- BBUseValMap[U->getParent()].contains(ValMap[Op]))
- NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
- else {
- BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
- NewVal =
- convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
- InsertPt, U->getParent());
- BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
- }
- assert(NewVal);
- U->setOperand(OpIdx, NewVal);
- }
- }
- }
-
- return true;
-}
-
-void LiveRegOptimizer::removeDeadInstrs() {
- // Remove instrs that have been marked dead after type-coercion.
- for (auto *I : DeadInstrs) {
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
- I->eraseFromParent();
- }
-}
-
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -410,7 +119,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
// Skip non-simple loads.
if (!LI.isSimple())
return false;
- Type *Ty = LI.getType();
+ auto *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f50a18ccc2188..9162e110aa10b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createSinkingPass());
+ addPass(createAMDGPULateCodeGenPreparePass());
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createAMDGPULateCodeGenPreparePass());
+ addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
deleted file mode 100644
index 83cb92210ec84..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ /dev/null
@@ -1,636 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v3i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v5, 16
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB0_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
-; GFX906-NEXT: .LBB0_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
-; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: global_store_short v1, v0, s[2:3]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v4i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB1_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
-; GFX906-NEXT: .LBB1_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v5i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB2_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: .LBB2_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT: global_store_byte v4, v1, s[2:3]
-; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1
-; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2
-; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3
-; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v8i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB3_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT: .LBB3_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v16i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB4_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
-; GFX906-NEXT: .LBB4_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB5_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
-; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v256i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX906-NEXT: s_mov_b32 s10, -1
-; GFX906-NEXT: s_mov_b32 s11, 0xe00000
-; GFX906-NEXT: s_add_u32 s8, s8, s3
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT: s_addc_u32 s9, s9, 0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB6_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
-; GFX906-NEXT: .LBB6_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT: v_mov_b32_e32 v0, v57
-; GFX906-NEXT: v_mov_b32_e32 v1, v58
-; GFX906-NEXT: v_mov_b32_e32 v2, v59
-; GFX906-NEXT: v_mov_b32_e32 v3, v60
-; GFX906-NEXT: v_mov_b32_e32 v60, v56
-; GFX906-NEXT: v_mov_b32_e32 v59, v55
-; GFX906-NEXT: v_mov_b32_e32 v58, v54
-; GFX906-NEXT: v_mov_b32_e32 v57, v53
-; GFX906-NEXT: v_mov_b32_e32 v56, v52
-; GFX906-NEXT: v_mov_b32_e32 v55, v51
-; GFX906-NEXT: v_mov_b32_e32 v54, v50
-; GFX906-NEXT: v_mov_b32_e32 v53, v49
-; GFX906-NEXT: v_mov_b32_e32 v52, v48
-; GFX906-NEXT: v_mov_b32_e32 v51, v47
-; GFX906-NEXT: v_mov_b32_e32 v50, v46
-; GFX906-NEXT: v_mov_b32_e32 v49, v45
-; GFX906-NEXT: v_mov_b32_e32 v48, v44
-; GFX906-NEXT: v_mov_b32_e32 v47, v43
-; GFX906-NEXT: v_mov_b32_e32 v46, v42
-; GFX906-NEXT: v_mov_b32_e32 v45, v41
-; GFX906-NEXT: v_mov_b32_e32 v44, v40
-; GFX906-NEXT: v_mov_b32_e32 v43, v39
-; GFX906-NEXT: v_mov_b32_e32 v42, v38
-; GFX906-NEXT: v_mov_b32_e32 v41, v37
-; GFX906-NEXT: v_mov_b32_e32 v40, v36
-; GFX906-NEXT: v_mov_b32_e32 v39, v35
-; GFX906-NEXT: v_mov_b32_e32 v38, v34
-; GFX906-NEXT: v_mov_b32_e32 v37, v33
-; GFX906-NEXT: v_mov_b32_e32 v36, v32
-; GFX906-NEXT: v_mov_b32_e32 v35, v31
-; GFX906-NEXT: v_mov_b32_e32 v34, v30
-; GFX906-NEXT: v_mov_b32_e32 v33, v29
-; GFX906-NEXT: v_mov_b32_e32 v32, v28
-; GFX906-NEXT: v_mov_b32_e32 v31, v27
-; GFX906-NEXT: v_mov_b32_e32 v30, v26
-; GFX906-NEXT: v_mov_b32_e32 v29, v25
-; GFX906-NEXT: v_mov_b32_e32 v28, v24
-; GFX906-NEXT: v_mov_b32_e32 v27, v23
-; GFX906-NEXT: v_mov_b32_e32 v26, v22
-; GFX906-NEXT: v_mov_b32_e32 v25, v21
-; GFX906-NEXT: v_mov_b32_e32 v24, v20
-; GFX906-NEXT: v_mov_b32_e32 v23, v19
-; GFX906-NEXT: v_mov_b32_e32 v22, v18
-; GFX906-NEXT: v_mov_b32_e32 v21, v17
-; GFX906-NEXT: v_mov_b32_e32 v20, v16
-; GFX906-NEXT: v_mov_b32_e32 v19, v15
-; GFX906-NEXT: v_mov_b32_e32 v18, v14
-; GFX906-NEXT: v_mov_b32_e32 v17, v13
-; GFX906-NEXT: v_mov_b32_e32 v16, v12
-; GFX906-NEXT: v_mov_b32_e32 v15, v11
-; GFX906-NEXT: v_mov_b32_e32 v14, v10
-; GFX906-NEXT: v_mov_b32_e32 v13, v9
-; GFX906-NEXT: v_mov_b32_e32 v12, v8
-; GFX906-NEXT: v_mov_b32_e32 v11, v7
-; GFX906-NEXT: v_mov_b32_e32 v10, v6
-; GFX906-NEXT: v_mov_b32_e32 v9, v5
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3]
-; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16
-; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32
-; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[2:3] offset:48
-; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64
-; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[2:3] offset:80
-; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96
-; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112
-; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128
-; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144
-; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160
-; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176
-; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192
-; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: repeat_successor:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_cmp_lt_i32 s2, 3
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
-; GFX906-NEXT: ; %bb.1: ; %LeafBlock
-; GFX906-NEXT: s_cmp_ge_i32 s2, 1
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT: ; %bb.2:
-; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT: global_load_dword v0, v0, s[4:5]
-; GFX906-NEXT: s_branch .LBB7_5
-; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
-; GFX906-NEXT: s_cmp_eq_u32 s2, 3
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT: ; %bb.4: ; %sw.bb5
-; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX906-NEXT: .LBB7_5: ; %return.sink.split
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX906-NEXT: .LBB7_6: ; %return
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- switch i32 %in, label %return [
- i32 1, label %return.sink.split
- i32 2, label %return.sink.split
- i32 3, label %sw.bb5
- ]
-
-sw.bb5:
- br label %return.sink.split
-
-return.sink.split:
- %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-
-return:
- ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_chain:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
-; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
-; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB8_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT: .LBB8_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT: s_cbranch_execz .LBB8_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5]
-; GFX906-NEXT: .LBB8_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
- store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_multi_block:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, v3
-; GFX906-NEXT: v_mov_b32_e32 v2, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB9_4
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB9_3
-; GFX906-NEXT: ; %bb.2: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT: .LBB9_3: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: .LBB9_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_loop_carried:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT: s_mov_b64 s[2:3], 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT: v_mov_b32_e32 v2, 24
-; GFX906-NEXT: .LBB10_1: ; %bb.1
-; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
-; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_cbranch_execnz .LBB10_1
-; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- br label %bb.1
-
-bb.1:
- %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
- br label %bb.2
-
-bb.2:
- store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-
-declare i32 @llvm.amdgcn.workitem.id.x()
-
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 11772d252a16f..93b9aeac3cd3f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -987,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
; OPT-NEXT: entry:
; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
; OPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; OPT-NEXT: i8 0, label [[THEN_1:%.*]]
-; OPT-NEXT: i8 3, label [[THEN_2:%.*]]
+; OPT-NEXT: i8 0, label [[THEN_1:%.*]]
+; OPT-NEXT: i8 3, label [[THEN_2:%.*]]
; OPT-NEXT: ]
; OPT: then.1:
; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -1025,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
; NOOPT-NEXT: entry:
; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
; NOOPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]]
-; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]]
+; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]]
+; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]]
; NOOPT-NEXT: ]
; NOOPT: then.1:
; NOOPT-NEXT: br label [[FINALLY:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 1e5ec361d154c..53acbb6a7bceb 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb10
-; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
-; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
-; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
-; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
-; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_3: ; %bb41
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48
; CHECK-NEXT: v_mov_b32_e32 v8, s10
@@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK-NEXT: v_mov_b32_e32 v19, s21
; CHECK-NEXT: v_mov_b32_e32 v20, s22
; CHECK-NEXT: v_mov_b32_e32 v21, s23
-; CHECK-NEXT: flat_store_byte v[8:9], v7
-; CHECK-NEXT: flat_store_byte v[10:11], v6
-; CHECK-NEXT: flat_store_byte v[12:13], v5
-; CHECK-NEXT: flat_store_byte v[14:15], v4
-; CHECK-NEXT: flat_store_byte v[16:17], v3
-; CHECK-NEXT: flat_store_byte v[18:19], v2
-; CHECK-NEXT: flat_store_byte v[20:21], v1
+; CHECK-NEXT: flat_store_byte v[8:9], v0
+; CHECK-NEXT: flat_store_byte v[10:11], v7
+; CHECK-NEXT: flat_store_byte v[12:13], v6
+; CHECK-NEXT: flat_store_byte v[14:15], v5
+; CHECK-NEXT: flat_store_byte v[16:17], v1
+; CHECK-NEXT: flat_store_byte v[18:19], v4
+; CHECK-NEXT: flat_store_byte v[20:21], v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT: flat_store_byte v[2:3], v0
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: flat_store_byte v[0:1], v2
; CHECK-NEXT: s_endpgm
bb:
br i1 %arg, label %bb10, label %bb41
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f..6dabd8c0b83ea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -30,25 +30,27 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v6, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB0_3
; SI-NEXT: s_branch .LBB0_4
; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB0_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -61,29 +63,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v0
-; SI-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB0_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v4, 1
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff
-; SI-NEXT: v_mov_b32_e32 v6, 0x8000
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff
+; SI-NEXT: v_mov_b32_e32 v4, 0x8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v6, 1
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -178,23 +180,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB1_3
; SI-NEXT: s_branch .LBB1_4
; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB1_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -209,39 +214,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v4, v0
-; SI-NEXT: v_or_b32_e32 v4, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v5, v5, v1
; SI-NEXT: .LBB1_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v5, 1
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff
-; SI-NEXT: v_mov_b32_e32 v7, 0x8000
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v5, 0x8000
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT: v_or_b32_e32 v0, v1, v8
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -494,9 +499,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -527,25 +532,27 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v6, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB3_3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: .LBB3_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB3_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -574,29 +581,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v0
-; SI-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB3_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v4, 1
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff
-; SI-NEXT: v_mov_b32_e32 v6, 0x8000
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff
+; SI-NEXT: v_mov_b32_e32 v4, 0x8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v6, 1
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -703,13 +710,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -727,15 +734,18 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v7, v2
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: s_branch .LBB4_4
; SI-NEXT: .LBB4_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB4_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -750,11 +760,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -775,29 +785,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v4, v0
-; SI-NEXT: v_or_b32_e32 v4, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB4_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v5, 1
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff
-; SI-NEXT: v_mov_b32_e32 v7, 0x8000
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v5, 0x8000
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT: v_or_b32_e32 v0, v1, v8
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1195,21 +1205,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1227,39 +1237,46 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; SI-NEXT: v_or_b32_e32 v5, v10, v2
-; SI-NEXT: v_or_b32_e32 v4, v8, v3
-; SI-NEXT: v_or_b32_e32 v3, v7, v9
-; SI-NEXT: v_or_b32_e32 v2, v6, v11
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v11, v2
+; SI-NEXT: v_or_b32_e32 v8, v8, v12
+; SI-NEXT: v_or_b32_e32 v2, v10, v13
+; SI-NEXT: v_or_b32_e32 v9, v9, v14
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB7_3: ; %T
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1277,52 +1294,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v8, v0
-; SI-NEXT: v_or_b32_e32 v4, v7, v1
-; SI-NEXT: v_or_b32_e32 v3, v6, v9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v8, v8, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
; SI-NEXT: .LBB7_4: ; %exit
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: s_movk_i32 s34, 0x3800
-; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000
-; SI-NEXT: v_mov_b32_e32 v9, 0x39000000
-; SI-NEXT: v_mov_b32_e32 v10, 0x3d00
-; SI-NEXT: v_mov_b32_e32 v11, 0x3900
+; SI-NEXT: v_mov_b32_e32 v8, 0x3d00
+; SI-NEXT: v_mov_b32_e32 v9, 0x3900
+; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000
+; SI-NEXT: v_mov_b32_e32 v11, 0x39000000
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
-; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
-; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
-; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
+; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v12
-; SI-NEXT: v_or_b32_e32 v4, v1, v13
-; SI-NEXT: v_or_b32_e32 v6, v2, v14
-; SI-NEXT: v_or_b32_e32 v2, v3, v5
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
+; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
+; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
+; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v4, v5, v12
+; SI-NEXT: v_or_b32_e32 v6, v3, v7
+; SI-NEXT: v_or_b32_e32 v2, v2, v8
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 36a93bd2511ce..15abf44f3a0ea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,82 +1,26 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; GCN-LABEL: extract_2xi16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: v_bfe_i32
+; GCN: v_bfe_i32
+
define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xi16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB0_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_or_b32_e32 v4, v0, v1
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB0_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB0_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_or_b32_e32 v4, v2, v0
-; GCN-NEXT: .LBB0_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4
-; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16
-; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
-; GCN-NEXT: v_mov_b32_e32 v3, 0x8000
-; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -95,59 +39,9 @@ exit:
ret <2 x i16> %r2
}
+; GCN-LABEL: extract_2xi64
+; GCN-COUNT-2: v_cndmask_b32
define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xi64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB1_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB1_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB1_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB1_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, -1
-; GCN-NEXT: v_mov_b32_e32 v3, -1
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -166,65 +60,9 @@ exit:
ret <2 x i64> %r2
}
+; GCN-LABEL: extract_4xi64
+; GCN-COUNT-4: v_cndmask_b32
define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_4xi64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB2_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB2_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB2_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB2_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, -1
-; GCN-NEXT: v_mov_b32_e32 v3, -1
-; GCN-NEXT: v_mov_b32_e32 v5, -1
-; GCN-NEXT: v_mov_b32_e32 v7, -1
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -243,92 +81,9 @@ exit:
ret <4 x i64> %r2
}
+; GCN-LABEL: extract_8xi64
+; GCN-COUNT-8: v_cndmask_b32
define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_8xi64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB3_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB3_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB3_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB3_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17]
-; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[12:13]
-; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v1, -1
-; GCN-NEXT: v_mov_b32_e32 v3, -1
-; GCN-NEXT: v_mov_b32_e32 v5, -1
-; GCN-NEXT: v_mov_b32_e32 v7, -1
-; GCN-NEXT: v_mov_b32_e32 v9, -1
-; GCN-NEXT: v_mov_b32_e32 v11, -1
-; GCN-NEXT: v_mov_b32_e32 v13, -1
-; GCN-NEXT: v_mov_b32_e32 v15, -1
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -347,59 +102,9 @@ exit:
ret <8 x i64> %r2
}
+; GCN-LABEL: extract_2xf64
+; GCN-COUNT-2: v_cndmask_b32
define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_2xf64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB4_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB4_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB4_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB4_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
-; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v0, -2.0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -418,65 +123,9 @@ exit:
ret <2 x double> %r2
}
+; GCN-LABEL: extract_4xf64
+; GCN-COUNT-4: v_cndmask_b32
define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_4xf64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB5_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB5_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB5_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB5_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
-; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc
-; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
-; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc
-; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -495,92 +144,9 @@ exit:
ret <4 x double> %r2
}
+; GCN-LABEL: extract_8xf64
+; GCN-COUNT-8: v_cndmask_b32
define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
-; GCN-LABEL: extract_8xf64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
-; GCN-NEXT: s_cbranch_execz .LBB6_2
-; GCN-NEXT: ; %bb.1: ; %F
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: .LBB6_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB6_4
-; GCN-NEXT: ; %bb.3: ; %T
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s8, s10
-; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: .LBB6_4: ; %exit
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
-; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
-; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
-; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
-; GCN-NEXT: v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v10, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v14, 0
-; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 952e89edeb799..08cf83fd2bd0f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -255,13 +255,13 @@
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Flatten the CFG
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU IR late optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Code sinking
-; GCN-O1-NEXT: Cycle Info Analysis
-; GCN-O1-NEXT: Uniformity Analysis
-; GCN-O1-NEXT: AMDGPU IR late optimizations
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Unify divergent function exit nodes
; GCN-O1-NEXT: Dominator Tree Construction
@@ -552,13 +552,13 @@
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Flatten the CFG
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Code sinking
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
-; GCN-O1-OPTS-NEXT: Uniformity Analysis
-; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
@@ -861,13 +861,13 @@
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Flatten the CFG
; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU IR late optimizations
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Code sinking
-; GCN-O2-NEXT: Cycle Info Analysis
-; GCN-O2-NEXT: Uniformity Analysis
-; GCN-O2-NEXT: AMDGPU IR late optimizations
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Unify divergent function exit nodes
; GCN-O2-NEXT: Dominator Tree Construction
@@ -1184,13 +1184,13 @@
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Flatten the CFG
; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU IR late optimizations
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Code sinking
-; GCN-O3-NEXT: Cycle Info Analysis
-; GCN-O3-NEXT: Uniformity Analysis
-; GCN-O3-NEXT: AMDGPU IR late optimizations
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Unify divergent function exit nodes
; GCN-O3-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 911bb44078d51..0f2eedb1923d6 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2101,7 +2101,10 @@ define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA: ; %bb.0: ; %bb0
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT: v_mov_b32_e32 v0, 0x100
+; NOSDWA-NEXT: v_mov_b32_e32 v0, 0xff
+; NOSDWA-NEXT: v_and_b32_e32 v0, s4, v0
+; NOSDWA-NEXT: v_lshlrev_b16_e64 v1, 8, 1
+; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v1
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2121,7 +2124,9 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX89: ; %bb.0: ; %bb0
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b64 s[4:5], 0
-; GFX89-NEXT: v_mov_b32_e32 v0, 0x100
+; GFX89-NEXT: v_lshlrev_b16_e64 v0, 8, 1
+; GFX89-NEXT: v_mov_b32_e32 v1, s4
+; GFX89-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX89-NEXT: s_and_b64 vcc, exec, -1
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2141,7 +2146,8 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX9: ; %bb.0: ; %bb0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x100
+; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1
+; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_and_b64 vcc, exec, -1
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2160,16 +2166,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
; GFX10: ; %bb.0: ; %bb0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB22_1: ; %bb1
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_lshl_b32 s6, s4, 3
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: v_lshrrev_b16 v2, s6, 0x100
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: v_mov_b32_e32 v2, s5
+; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0
; GFX10-NEXT: s_mov_b64 s[4:5], 1
-; GFX10-NEXT: flat_store_byte v[0:1], v2
+; GFX10-NEXT: flat_store_byte v[1:2], v3
; GFX10-NEXT: s_cbranch_vccnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 2355fa7870ea8..f78b408d78255 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
@@ -6,31 +6,27 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT: s_mov_b32 s4, 0xff0000
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v2, v5, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
+; GFX906-NEXT: global_load_dword v2, v5, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2
-; GFX906-NEXT: global_store_short v1, v4, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:2
+; GFX906-NEXT: global_store_short v1, v0, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -54,19 +50,31 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v2, v3, s[4:5]
+; GFX906-NEXT: global_load_dword v2, v6, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v2, v3, s[6:7]
+; GFX906-NEXT: global_load_dword v2, v6, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v2, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,23 +98,32 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 0
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4
-; GFX906-NEXT: global_store_dword v3, v1, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v5, v2, s[2:3] offset:4
+; GFX906-NEXT: global_store_dword v5, v0, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,19 +147,42 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB3_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX906-NEXT: .LBB3_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -166,19 +206,64 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0
; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB4_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
; GFX906-NEXT: .LBB4_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
+; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12
+; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8
+; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -201,24 +286,114 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7]
-; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3]
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT: .LBB5_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27
+; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24
+; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v18
+; GFX906-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v17
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v15
+; GFX906-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v12
+; GFX906-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v11
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -240,595 +415,1572 @@ bb.2:
define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v256i8_liveout:
; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0
; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_mov_b32 s10, -1
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240
; GFX906-NEXT: s_mov_b32 s11, 0xe00000
; GFX906-NEXT: s_add_u32 s8, s8, s3
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0
; GFX906-NEXT: s_addc_u32 s9, s9, 0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224
+; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[4:5] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[4:5] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[4:5] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[4:5] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[4:5] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[4:5] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[4:5] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[4:5] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[4:5] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[4:5] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[4:5] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[4:5] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[6:7] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[6:7] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[6:7] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[6:7] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[6:7] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[6:7] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[6:7] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[6:7] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[6:7] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[6:7] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[6:7] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7]
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
; GFX906-NEXT: .LBB6_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:80
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:64
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:48
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:32
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
-; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224
-; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:208
-; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:192
-; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:176
-; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:160
-; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:144
-; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:128
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: repeat_successor:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_cmp_lt_i32 s8, 3
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
-; GFX906-NEXT: ; %bb.1: ; %LeafBlock
-; GFX906-NEXT: s_cmp_gt_i32 s8, 0
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT: ; %bb.2:
-; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT: global_load_dword v0, v0, s[4:5]
-; GFX906-NEXT: s_branch .LBB7_5
-; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
-; GFX906-NEXT: s_cmp_eq_u32 s8, 3
-; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
-; GFX906-NEXT: ; %bb.4: ; %sw.bb5
-; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX906-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX906-NEXT: .LBB7_5: ; %return.sink.split
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58
+; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX906-NEXT: .LBB7_6: ; %return
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- switch i32 %in, label %return [
- i32 1, label %return.sink.split
- i32 2, label %return.sink.split
- i32 3, label %sw.bb5
- ]
-
-sw.bb5:
- br label %return.sink.split
-
-return.sink.split:
- %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-
-return:
- ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_chain:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
-; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB8_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT: .LBB8_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT: s_cbranch_execz .LBB8_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5]
-; GFX906-NEXT: .LBB8_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
- store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-
-define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_zeroinit:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1]
-; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB9_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3]
-; GFX906-NEXT: s_mov_b32 s2, 0
-; GFX906-NEXT: s_mov_b32 s3, s2
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_mov_b32_e32 v4, s3
-; GFX906-NEXT: v_mov_b32_e32 v3, s2
-; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GFX906-NEXT: .LBB9_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT: s_cbranch_execz .LBB9_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, v3
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: v_mov_b32_e32 v2, v4
-; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT: .LBB9_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54
+; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
- store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_phi_const:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
-; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB10_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX906-NEXT: v_mov_b32_e32 v1, 1
-; GFX906-NEXT: v_mov_b32_e32 v8, 2
-; GFX906-NEXT: v_mov_b32_e32 v6, 3
-; GFX906-NEXT: v_mov_b32_e32 v7, 4
-; GFX906-NEXT: v_mov_b32_e32 v2, 5
-; GFX906-NEXT: v_mov_b32_e32 v5, 6
-; GFX906-NEXT: v_mov_b32_e32 v3, 7
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX906-NEXT: .LBB10_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT: s_cbranch_execz .LBB10_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50
+; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46
+; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42
+; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22
+; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18
+; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v5
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx2 v9, v[0:1], s[4:5]
-; GFX906-NEXT: .LBB10_4: ; %bb.3
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
- store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: v8i8_multi_block:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
-; GFX906-NEXT: v_mov_b32_e32 v5, 0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[0:1]
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, v3
-; GFX906-NEXT: v_mov_b32_e32 v2, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB11_4
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3]
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB11_3
-; GFX906-NEXT: ; %bb.2: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
-; GFX906-NEXT: .LBB11_3: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: .LBB11_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[6:7]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_loop_carried:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0
-; GFX906-NEXT: s_mov_b32 s4, 0x2000604
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v1, s[2:3]
-; GFX906-NEXT: s_mov_b64 s[2:3], 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v0, v1
-; GFX906-NEXT: .LBB12_1: ; %bb.1
-; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc
-; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_cbranch_execnz .LBB12_1
-; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- br label %bb.1
-
-bb.1:
- %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
- br label %bb.2
-
-bb.2:
- store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-; Should not have instances of "Instruction does not dominate all uses!"
-
-define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
-; GFX906-LABEL: v8i8_multiuse_multiblock:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5]
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB13_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: s_movk_i32 s6, 0xff00
-; GFX906-NEXT: v_mov_b32_e32 v5, 8
-; GFX906-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: s_mov_b32 s6, 0x6070504
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
-; GFX906-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX906-NEXT: v_perm_b32 v7, v1, v1, s6
-; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX906-NEXT: v_mov_b32_e32 v3, 0
-; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
-; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dword v3, v1, s[8:9]
-; GFX906-NEXT: global_store_dword v3, v7, s[8:9] offset:8
-; GFX906-NEXT: global_store_dword v3, v6, s[8:9] offset:16
-; GFX906-NEXT: global_store_dword v3, v4, s[8:9] offset:24
-; GFX906-NEXT: .LBB13_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
-; GFX906-NEXT: s_cbranch_execz .LBB13_4
-; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v2
-; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v2
-; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v1
-; GFX906-NEXT: s_mov_b32 s2, 0xc0c0001
-; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_perm_b32 v2, 0, v2, s2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_perm_b32 v6, 0, v1, s2
-; GFX906-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT: v_and_or_b32 v7, v1, s3, v6
-; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX906-NEXT: global_store_dword v0, v3, s[10:11]
-; GFX906-NEXT: global_store_dword v0, v4, s[10:11] offset:8
-; GFX906-NEXT: global_store_dword v0, v7, s[10:11] offset:16
-; GFX906-NEXT: global_store_dword v0, v2, s[10:11] offset:24
-; GFX906-NEXT: .LBB13_4: ; %bb.3
-; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX906-NEXT: s_movk_i32 s3, 0xff00
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
-; GFX906-NEXT: s_movk_i32 s2, 0xff
-; GFX906-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX906-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v1
-; GFX906-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dword v0, v3, s[0:1]
-; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:8
-; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16
-; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
%gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
%cmp = icmp ult i32 %idx, 15
br i1 %cmp, label %bb.1, label %bb.2
bb.1:
- %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
- %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
- %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
- %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
- %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
- %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
- %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
- store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
- store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
- store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
- store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
+ br label %bb.2
bb.2:
- %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
- %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
- %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
- %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
- %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
- %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
- %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
- store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
- store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
- store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
- store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
- br label %bb.3
-
-bb.3:
- %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
- %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
- %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
- %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
- %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
- %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
- %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
- store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
- store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
- store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
- store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
+ %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
ret void
}
-
declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
deleted file mode 100644
index 5d2e299aa854a..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
+++ /dev/null
@@ -1,352 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: br label [[BB_2]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
-; GFX906-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
-; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: br label [[BB_2]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
-; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: br label [[BB_2]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
-; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: br label [[BB_2]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
-; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [
-; GFX906-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]]
-; GFX906-NEXT: i32 2, label [[RETURN_SINK_SPLIT]]
-; GFX906-NEXT: i32 3, label [[SW_BB5:%.*]]
-; GFX906-NEXT: ]
-; GFX906: sw.bb5:
-; GFX906-NEXT: br label [[RETURN_SINK_SPLIT]]
-; GFX906: return.sink.split:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
-; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-; GFX906: return:
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- switch i32 %in, label %return [
- i32 1, label %return.sink.split
- i32 2, label %return.sink.split
- i32 3, label %sw.bb5
- ]
-
-sw.bb5:
- br label %return.sink.split
-
-return.sink.split:
- %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-
-return:
- ret void
-}
-
-define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
-; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
-; GFX906-NEXT: br label [[BB_3]]
-; GFX906: bb.3:
-; GFX906-NEXT: [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
-; GFX906-NEXT: [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
-; GFX906-NEXT: store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
- store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
-; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
-; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
-; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
-; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
-; GFX906-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
-; GFX906-NEXT: br label [[BB_3]]
-; GFX906: bb.3:
-; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
-; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
-; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.3
-bb.1:
- %cmp2 = icmp ult i32 %idx, 7
- br i1 %cmp2, label %bb.2, label %bb.3
-
-bb.2:
- store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
- br label %bb.3
-
-bb.3:
- %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
- store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
- ret void
-}
-
-define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
-; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
-; GFX906-NEXT: entry:
-; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
-; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
-; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
-; GFX906-NEXT: br label [[BB_1:%.*]]
-; GFX906: bb.1:
-; GFX906-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
-; GFX906-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
-; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
-; GFX906-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; GFX906-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
-; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
-; GFX906-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
-; GFX906: 0:
-; GFX906-NEXT: br label [[BB_2]]
-; GFX906: bb.2:
-; GFX906-NEXT: [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
-; GFX906-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
-; GFX906-NEXT: ret void
-;
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- br label %bb.1
-
-bb.1:
- %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
- br label %bb.2
-
-bb.2:
- store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list