[llvm] [AMDGPU] Add IR LiveReg type-based optimization (PR #66838)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 2 15:05:50 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/66838
>From 5b9dff43ba9174a4e94e7cdc14729dfb99932587 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 14 Sep 2023 12:20:06 -0700
Subject: [PATCH 1/3] [AMDGPU] Add IR LiveReg type-based optimization
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 295 ++-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +-
.../AMDGPU/GlobalISel/vni8-across-blocks.ll | 636 +++++
.../amdgpu-codegenprepare-break-large-phis.ll | 8 +-
...dagcomb-extract-vec-elt-different-sizes.ll | 39 +-
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 359 ++-
llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 498 +++-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 24 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 24 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2354 +++++------------
llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll | 352 +++
11 files changed, 2565 insertions(+), 2028 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 69fdeaebe0a01..7623b73d6dd5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,6 +81,73 @@ class AMDGPULateCodeGenPrepare
bool visitLoadInst(LoadInst &LI);
};
+using ValueToValueMap = DenseMap<const Value *, Value *>;
+
+class LiveRegOptimizer {
+private:
+ Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
+ const GCNSubtarget *ST;
+ /// The scalar type to convert to
+ Type *ConvertToScalar;
+ /// The set of visited Instructions
+ SmallPtrSet<Instruction *, 4> Visited;
+ /// The set of Instructions to be deleted
+ SmallPtrSet<Instruction *, 4> DeadInstrs;
+ /// Map of Value -> Converted Value
+ ValueToValueMap ValMap;
+ /// Map containing conversions from Optimal Type -> Original Type per BB.
+ DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
+
+public:
+ /// Calculate and return the type to convert to, given a problematic \p
+ /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
+ Type *calculateConvertType(Type *OriginalType);
+ /// Convert the virtual register defined by \p V to the compatible vector of
+ /// legal type
+ Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
+ /// Convert the virtual register defined by \p V back to the original type \p
+ /// ConvertType, stripping away the MSBs in cases where there was an imperfect
+ /// fit (e.g. v2i32 -> v7i8)
+ Value *convertFromOptType(Type *ConvertType, Instruction *V,
+ BasicBlock::iterator &InstPt,
+ BasicBlock *InsertBlock);
+ /// Check for problematic PHI nodes or cross-bb values based on the value
+ /// defined by \p I, and coerce to legal types if necessary. For problematic
+ /// PHI nodes, we coerce all incoming values in a single invocation.
+ bool optimizeLiveType(Instruction *I);
+
+ /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
+ void removeDeadInstrs();
+
+ // Whether or not the type should be replaced to avoid inefficient
+ // legalization code
+ bool shouldReplace(Type *ITy) {
+ FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
+ if (!VTy)
+ return false;
+
+ auto TLI = ST->getTargetLowering();
+
+ Type *EltTy = VTy->getElementType();
+ // If the element size is greater than the convert-to-scalar size, then we
+ // can't do any bit packing
+ if (!EltTy->isIntegerTy() ||
+ EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
+ return false;
+
+ // Only coerce illegal types
+ TargetLoweringBase::LegalizeKind LK =
+ TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
+ return LK.first != TargetLoweringBase::TypeLegal;
+ }
+
+ LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
+ DL = &Mod->getDataLayout();
+ ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+ }
+};
+
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -102,14 +169,238 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ // "Optimize" the virtual regs that cross basic block boundaries. When
+ // building the SelectionDAG, vectors of illegal types that cross basic blocks
+ // will be scalarized and widened, with each scalar living in its
+ // own register. To work around this, this optimization converts the
+ // vectors to equivalent vectors of legal type (which are converted back
+ // before uses in subsequent blocks), to pack the bits into fewer physical
+ // registers (used in CopyToReg/CopyFromReg pairs).
+ LiveRegOptimizer LRO(Mod, &ST);
+
bool Changed = false;
+
for (auto &BB : F)
- for (Instruction &I : llvm::make_early_inc_range(BB))
+ for (Instruction &I : make_early_inc_range(BB)) {
Changed |= visit(I);
+ Changed |= LRO.optimizeLiveType(&I);
+ }
+ LRO.removeDeadInstrs();
return Changed;
}
+Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
+ assert(OriginalType->getScalarSizeInBits() <=
+ ConvertToScalar->getScalarSizeInBits());
+
+ FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
+
+ TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+ TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
+ unsigned ConvertEltCount =
+ (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+ if (OriginalSize <= ConvertScalarSize)
+ return IntegerType::get(Mod->getContext(), ConvertScalarSize);
+
+ return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+ ConvertEltCount, false);
+}
+
+Value *LiveRegOptimizer::convertToOptType(Instruction *V,
+ BasicBlock::iterator &InsertPt) {
+ FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
+ Type *NewTy = calculateConvertType(V->getType());
+
+ TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+ TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
+
+ IRBuilder<> Builder(V->getParent(), InsertPt);
+ // If there is a bitsize match, we can fit the old vector into a new vector of
+ // desired type.
+ if (OriginalSize == NewSize)
+ return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
+
+ // If there is a bitsize mismatch, we must use a wider vector.
+ assert(NewSize > OriginalSize);
+ uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
+
+ SmallVector<int, 8> ShuffleMask;
+ uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
+ for (unsigned I = 0; I < OriginalElementCount; I++)
+ ShuffleMask.push_back(I);
+
+ for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
+ ShuffleMask.push_back(OriginalElementCount);
+
+ Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
+ return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
+}
+
+Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
+ BasicBlock::iterator &InsertPt,
+ BasicBlock *InsertBB) {
+ FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
+
+ TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
+ TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
+
+ IRBuilder<> Builder(InsertBB, InsertPt);
+ // If there is a bitsize match, we simply convert back to the original type.
+ if (OriginalSize == NewSize)
+ return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
+
+ // If there is a bitsize mismatch, then we must have used a wider value to
+ // hold the bits.
+ assert(OriginalSize > NewSize);
+ // For wide scalars, we can just truncate the value.
+ if (!V->getType()->isVectorTy()) {
+ Instruction *Trunc = cast<Instruction>(
+ Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
+ return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+ }
+
+ // For wider vectors, we must strip the MSBs to convert back to the original
+ // type.
+ VectorType *ExpandedVT = VectorType::get(
+ Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+ (OriginalSize / NewVTy->getScalarSizeInBits()), false);
+ Instruction *Converted =
+ cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
+
+ unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+ SmallVector<int, 8> ShuffleMask(NarrowElementCount);
+ std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+
+ return Builder.CreateShuffleVector(Converted, ShuffleMask);
+}
+
+bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<PHINode *, 4> PhiNodes;
+ SmallPtrSet<Instruction *, 4> Defs;
+ SmallPtrSet<Instruction *, 4> Uses;
+
+ Worklist.push_back(cast<Instruction>(I));
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.pop_back_val();
+
+ if (!Visited.insert(II).second)
+ continue;
+
+ if (!shouldReplace(II->getType()))
+ continue;
+
+ if (PHINode *Phi = dyn_cast<PHINode>(II)) {
+ PhiNodes.insert(Phi);
+ // Collect all the incoming values of problematic PHI nodes.
+ for (Value *V : Phi->incoming_values()) {
+ // Repeat the collection process for newly found PHI nodes.
+ if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+ Worklist.push_back(OpPhi);
+ continue;
+ }
+
+ Instruction *IncInst = dyn_cast<Instruction>(V);
+ // Other incoming value types (e.g. vector literals) are unhandled
+ if (!IncInst && !isa<ConstantAggregateZero>(V))
+ return false;
+
+ // Collect all other incoming values for coercion.
+ if (IncInst)
+ Defs.insert(IncInst);
+ }
+ }
+
+ // Collect all relevant uses.
+ for (User *V : II->users()) {
+ // Repeat the collection process for problematic PHI nodes.
+ if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+ Worklist.push_back(OpPhi);
+ continue;
+ }
+
+ Instruction *UseInst = cast<Instruction>(V);
+ // Collect all uses of PHINodes and any use that crosses BB boundaries.
+ if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
+ Uses.insert(UseInst);
+ if (!Defs.count(II) && !isa<PHINode>(II)) {
+ Defs.insert(II);
+ }
+ }
+ }
+ }
+
+ // Coerce and track the defs.
+ for (Instruction *D : Defs) {
+ if (!ValMap.contains(D)) {
+ BasicBlock::iterator InsertPt = std::next(D->getIterator());
+ Value *ConvertVal = convertToOptType(D, InsertPt);
+ assert(ConvertVal);
+ ValMap[D] = ConvertVal;
+ }
+ }
+
+ // Construct new-typed PHI nodes.
+ for (PHINode *Phi : PhiNodes) {
+ ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
+ Phi->getNumIncomingValues(),
+ Phi->getName() + ".tc", Phi->getIterator());
+ }
+
+ // Connect all the PHI nodes with their new incoming values.
+ for (PHINode *Phi : PhiNodes) {
+ PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+ bool MissingIncVal = false;
+ for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
+ Value *IncVal = Phi->getIncomingValue(I);
+ if (isa<ConstantAggregateZero>(IncVal)) {
+ Type *NewType = calculateConvertType(Phi->getType());
+ NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
+ Phi->getIncomingBlock(I));
+ } else if (ValMap.contains(IncVal))
+ NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
+ else
+ MissingIncVal = true;
+ }
+ DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
+ }
+ // Coerce back to the original type and replace the uses.
+ for (Instruction *U : Uses) {
+ // Replace all converted operands for a use.
+ for (auto [OpIdx, Op] : enumerate(U->operands())) {
+ if (ValMap.contains(Op)) {
+ Value *NewVal = nullptr;
+ if (BBUseValMap.contains(U->getParent()) &&
+ BBUseValMap[U->getParent()].contains(ValMap[Op]))
+ NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
+ else {
+ BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
+ NewVal =
+ convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
+ InsertPt, U->getParent());
+ BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+ }
+ assert(NewVal);
+ U->setOperand(OpIdx, NewVal);
+ }
+ }
+ }
+
+ return true;
+}
+
+void LiveRegOptimizer::removeDeadInstrs() {
+ // Remove instrs that have been marked dead after type-coercion.
+ for (auto *I : DeadInstrs) {
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+ I->eraseFromParent();
+ }
+}
+
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -119,7 +410,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
// Skip non-simple loads.
if (!LI.isSimple())
return false;
- auto *Ty = LI.getType();
+ Type *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b926..c15481336075e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1150,10 +1150,10 @@ bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createAMDGPULateCodeGenPreparePass());
+ addPass(createSinkingPass());
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createSinkingPass());
+ addPass(createAMDGPULateCodeGenPreparePass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
new file mode 100644
index 0000000000000..83cb92210ec84
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -0,0 +1,636 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v3i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT: v_mov_b32_e32 v3, 8
+; GFX906-NEXT: v_mov_b32_e32 v5, 16
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB0_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
+; GFX906-NEXT: .LBB0_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
+; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: global_store_short v1, v0, s[2:3]
+; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v4i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB1_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
+; GFX906-NEXT: .LBB1_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v5i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB2_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: .LBB2_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX906-NEXT: global_store_byte v4, v1, s[2:3]
+; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1
+; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2
+; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3
+; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v8i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB3_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
+; GFX906-NEXT: .LBB3_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v16i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB4_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
+; GFX906-NEXT: .LBB4_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB5_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
+; GFX906-NEXT: .LBB5_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v256i8_liveout:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX906-NEXT: s_mov_b32 s10, -1
+; GFX906-NEXT: s_mov_b32 s11, 0xe00000
+; GFX906-NEXT: s_add_u32 s8, s8, s3
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT: s_addc_u32 s9, s9, 0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB6_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
+; GFX906-NEXT: .LBB6_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_mov_b32_e32 v0, v57
+; GFX906-NEXT: v_mov_b32_e32 v1, v58
+; GFX906-NEXT: v_mov_b32_e32 v2, v59
+; GFX906-NEXT: v_mov_b32_e32 v3, v60
+; GFX906-NEXT: v_mov_b32_e32 v60, v56
+; GFX906-NEXT: v_mov_b32_e32 v59, v55
+; GFX906-NEXT: v_mov_b32_e32 v58, v54
+; GFX906-NEXT: v_mov_b32_e32 v57, v53
+; GFX906-NEXT: v_mov_b32_e32 v56, v52
+; GFX906-NEXT: v_mov_b32_e32 v55, v51
+; GFX906-NEXT: v_mov_b32_e32 v54, v50
+; GFX906-NEXT: v_mov_b32_e32 v53, v49
+; GFX906-NEXT: v_mov_b32_e32 v52, v48
+; GFX906-NEXT: v_mov_b32_e32 v51, v47
+; GFX906-NEXT: v_mov_b32_e32 v50, v46
+; GFX906-NEXT: v_mov_b32_e32 v49, v45
+; GFX906-NEXT: v_mov_b32_e32 v48, v44
+; GFX906-NEXT: v_mov_b32_e32 v47, v43
+; GFX906-NEXT: v_mov_b32_e32 v46, v42
+; GFX906-NEXT: v_mov_b32_e32 v45, v41
+; GFX906-NEXT: v_mov_b32_e32 v44, v40
+; GFX906-NEXT: v_mov_b32_e32 v43, v39
+; GFX906-NEXT: v_mov_b32_e32 v42, v38
+; GFX906-NEXT: v_mov_b32_e32 v41, v37
+; GFX906-NEXT: v_mov_b32_e32 v40, v36
+; GFX906-NEXT: v_mov_b32_e32 v39, v35
+; GFX906-NEXT: v_mov_b32_e32 v38, v34
+; GFX906-NEXT: v_mov_b32_e32 v37, v33
+; GFX906-NEXT: v_mov_b32_e32 v36, v32
+; GFX906-NEXT: v_mov_b32_e32 v35, v31
+; GFX906-NEXT: v_mov_b32_e32 v34, v30
+; GFX906-NEXT: v_mov_b32_e32 v33, v29
+; GFX906-NEXT: v_mov_b32_e32 v32, v28
+; GFX906-NEXT: v_mov_b32_e32 v31, v27
+; GFX906-NEXT: v_mov_b32_e32 v30, v26
+; GFX906-NEXT: v_mov_b32_e32 v29, v25
+; GFX906-NEXT: v_mov_b32_e32 v28, v24
+; GFX906-NEXT: v_mov_b32_e32 v27, v23
+; GFX906-NEXT: v_mov_b32_e32 v26, v22
+; GFX906-NEXT: v_mov_b32_e32 v25, v21
+; GFX906-NEXT: v_mov_b32_e32 v24, v20
+; GFX906-NEXT: v_mov_b32_e32 v23, v19
+; GFX906-NEXT: v_mov_b32_e32 v22, v18
+; GFX906-NEXT: v_mov_b32_e32 v21, v17
+; GFX906-NEXT: v_mov_b32_e32 v20, v16
+; GFX906-NEXT: v_mov_b32_e32 v19, v15
+; GFX906-NEXT: v_mov_b32_e32 v18, v14
+; GFX906-NEXT: v_mov_b32_e32 v17, v13
+; GFX906-NEXT: v_mov_b32_e32 v16, v12
+; GFX906-NEXT: v_mov_b32_e32 v15, v11
+; GFX906-NEXT: v_mov_b32_e32 v14, v10
+; GFX906-NEXT: v_mov_b32_e32 v13, v9
+; GFX906-NEXT: v_mov_b32_e32 v12, v8
+; GFX906-NEXT: v_mov_b32_e32 v11, v7
+; GFX906-NEXT: v_mov_b32_e32 v10, v6
+; GFX906-NEXT: v_mov_b32_e32 v9, v5
+; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3]
+; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16
+; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32
+; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[2:3] offset:48
+; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64
+; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[2:3] offset:80
+; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96
+; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112
+; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128
+; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144
+; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160
+; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176
+; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192
+; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: repeat_successor:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: s_cmp_lt_i32 s2, 3
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
+; GFX906-NEXT: ; %bb.1: ; %LeafBlock
+; GFX906-NEXT: s_cmp_ge_i32 s2, 1
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT: ; %bb.2:
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT: global_load_dword v0, v0, s[4:5]
+; GFX906-NEXT: s_branch .LBB7_5
+; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
+; GFX906-NEXT: s_cmp_eq_u32 s2, 3
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT: ; %bb.4: ; %sw.bb5
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT: global_load_dword v0, v0, s[6:7]
+; GFX906-NEXT: .LBB7_5: ; %return.sink.split
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT: .LBB7_6: ; %return
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+ switch i32 %in, label %return [
+ i32 1, label %return.sink.split
+ i32 2, label %return.sink.split
+ i32 3, label %sw.bb5
+ ]
+
+sw.bb5:
+ br label %return.sink.split
+
+return.sink.split:
+ %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+ store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+
+return:
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_chain:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
+; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB8_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
+; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT: .LBB8_2: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT: s_cbranch_execz .LBB8_4
+; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT: .LBB8_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+ store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_multi_block:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
+; GFX906-NEXT: v_mov_b32_e32 v2, v4
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB9_4
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB9_3
+; GFX906-NEXT: ; %bb.2: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT: .LBB9_3: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: .LBB9_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_loop_carried:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT: v_mov_b32_e32 v3, 8
+; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT: s_mov_b64 s[2:3], 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX906-NEXT: v_mov_b32_e32 v2, 24
+; GFX906-NEXT: .LBB10_1: ; %bb.1
+; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
+; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_cbranch_execnz .LBB10_1
+; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ br label %bb.1
+
+bb.1:
+ %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+ br label %bb.2
+
+bb.2:
+ store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 93b9aeac3cd3f..11772d252a16f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -987,8 +987,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
; OPT-NEXT: entry:
; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
; OPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; OPT-NEXT: i8 0, label [[THEN_1:%.*]]
-; OPT-NEXT: i8 3, label [[THEN_2:%.*]]
+; OPT-NEXT: i8 0, label [[THEN_1:%.*]]
+; OPT-NEXT: i8 3, label [[THEN_2:%.*]]
; OPT-NEXT: ]
; OPT: then.1:
; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -1025,8 +1025,8 @@ define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) {
; NOOPT-NEXT: entry:
; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3
; NOOPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [
-; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]]
-; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]]
+; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]]
+; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]]
; NOOPT-NEXT: ]
; NOOPT: then.1:
; NOOPT-NEXT: br label [[FINALLY:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 53acbb6a7bceb..1e5ec361d154c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,29 +8,30 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb10
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9]
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
+; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
+; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
+; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
+; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
+; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
+; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_mov_b32_e32 v7, 0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_3: ; %bb41
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48
; CHECK-NEXT: v_mov_b32_e32 v8, s10
@@ -47,16 +48,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK-NEXT: v_mov_b32_e32 v19, s21
; CHECK-NEXT: v_mov_b32_e32 v20, s22
; CHECK-NEXT: v_mov_b32_e32 v21, s23
-; CHECK-NEXT: flat_store_byte v[8:9], v0
-; CHECK-NEXT: flat_store_byte v[10:11], v7
-; CHECK-NEXT: flat_store_byte v[12:13], v6
-; CHECK-NEXT: flat_store_byte v[14:15], v5
-; CHECK-NEXT: flat_store_byte v[16:17], v1
-; CHECK-NEXT: flat_store_byte v[18:19], v4
-; CHECK-NEXT: flat_store_byte v[20:21], v3
+; CHECK-NEXT: flat_store_byte v[8:9], v7
+; CHECK-NEXT: flat_store_byte v[10:11], v6
+; CHECK-NEXT: flat_store_byte v[12:13], v5
+; CHECK-NEXT: flat_store_byte v[14:15], v4
+; CHECK-NEXT: flat_store_byte v[16:17], v3
+; CHECK-NEXT: flat_store_byte v[18:19], v2
+; CHECK-NEXT: flat_store_byte v[20:21], v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: flat_store_byte v[2:3], v0
; CHECK-NEXT: s_endpgm
bb:
br i1 %arg, label %bb10, label %bb41
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 6dabd8c0b83ea..efbbe2b27f10f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -30,27 +30,25 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v5, v3
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v6, v2
+; SI-NEXT: v_or_b32_e32 v2, v4, v5
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB0_3
; SI-NEXT: s_branch .LBB0_4
; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB0_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -63,29 +61,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v0
+; SI-NEXT: v_or_b32_e32 v2, v2, v1
; SI-NEXT: .LBB0_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff
-; SI-NEXT: v_mov_b32_e32 v4, 0x8000
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v6, 1
+; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
+; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v4, 1
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff
+; SI-NEXT: v_mov_b32_e32 v6, 0x8000
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -180,26 +178,23 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
-; SI-NEXT: v_or_b32_e32 v5, v5, v7
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v2
+; SI-NEXT: v_or_b32_e32 v4, v4, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB1_3
; SI-NEXT: s_branch .LBB1_4
; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB1_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -214,39 +209,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v0
-; SI-NEXT: v_or_b32_e32 v5, v5, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_or_b32_e32 v5, v4, v0
+; SI-NEXT: v_or_b32_e32 v4, v2, v1
; SI-NEXT: .LBB1_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
+; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff
-; SI-NEXT: v_mov_b32_e32 v5, 0x8000
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v7, 1
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v5, 1
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff
+; SI-NEXT: v_mov_b32_e32 v7, 0x8000
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
+; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v0, v1, v8
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -499,9 +494,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -532,27 +527,25 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v5, v3
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v6, v2
+; SI-NEXT: v_or_b32_e32 v2, v4, v5
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB3_3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: .LBB3_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB3_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -581,29 +574,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v4, v0
+; SI-NEXT: v_or_b32_e32 v2, v2, v1
; SI-NEXT: .LBB3_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff
-; SI-NEXT: v_mov_b32_e32 v4, 0x8000
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v6, 1
+; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
+; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v4, 1
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff
+; SI-NEXT: v_mov_b32_e32 v6, 0x8000
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v3, v4
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -710,13 +703,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -734,18 +727,15 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v7, v2
-; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_or_b32_e32 v5, v6, v2
+; SI-NEXT: v_or_b32_e32 v4, v4, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: s_branch .LBB4_4
; SI-NEXT: .LBB4_2:
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB4_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -760,11 +750,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -785,29 +775,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; SI-NEXT: v_or_b32_e32 v5, v4, v0
+; SI-NEXT: v_or_b32_e32 v4, v2, v1
; SI-NEXT: .LBB4_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
+; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff
-; SI-NEXT: v_mov_b32_e32 v5, 0x8000
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v7, 1
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v5, 1
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff
+; SI-NEXT: v_mov_b32_e32 v7, 0x8000
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; SI-NEXT: v_or_b32_e32 v0, v1, v8
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1205,21 +1195,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1237,46 +1227,39 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v11, v2
-; SI-NEXT: v_or_b32_e32 v8, v8, v12
-; SI-NEXT: v_or_b32_e32 v2, v10, v13
-; SI-NEXT: v_or_b32_e32 v9, v9, v14
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; SI-NEXT: v_or_b32_e32 v5, v10, v2
+; SI-NEXT: v_or_b32_e32 v4, v8, v3
+; SI-NEXT: v_or_b32_e32 v3, v7, v9
+; SI-NEXT: v_or_b32_e32 v2, v6, v11
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_2:
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB7_3: ; %T
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1294,52 +1277,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v3, v0
-; SI-NEXT: v_or_b32_e32 v8, v8, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; SI-NEXT: v_or_b32_e32 v5, v8, v0
+; SI-NEXT: v_or_b32_e32 v4, v7, v1
+; SI-NEXT: v_or_b32_e32 v3, v6, v9
; SI-NEXT: v_or_b32_e32 v2, v2, v10
-; SI-NEXT: v_or_b32_e32 v9, v9, v11
; SI-NEXT: .LBB7_4: ; %exit
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_movk_i32 s34, 0x3800
-; SI-NEXT: v_mov_b32_e32 v8, 0x3d00
-; SI-NEXT: v_mov_b32_e32 v9, 0x3900
-; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000
-; SI-NEXT: v_mov_b32_e32 v11, 0x39000000
+; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000
+; SI-NEXT: v_mov_b32_e32 v9, 0x39000000
+; SI-NEXT: v_mov_b32_e32 v10, 0x3d00
+; SI-NEXT: v_mov_b32_e32 v11, 0x3900
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
+; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
+; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
-; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
-; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v4, v5, v12
-; SI-NEXT: v_or_b32_e32 v6, v3, v7
-; SI-NEXT: v_or_b32_e32 v2, v2, v8
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v12
+; SI-NEXT: v_or_b32_e32 v4, v1, v13
+; SI-NEXT: v_or_b32_e32 v6, v2, v14
+; SI-NEXT: v_or_b32_e32 v2, v3, v5
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16
+; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 15abf44f3a0ea..36a93bd2511ce 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -1,26 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: extract_2xi16
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: v_bfe_i32
-; GCN: v_bfe_i32
-
define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xi16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v4, v0, v1
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB0_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_or_b32_e32 v4, v2, v0
+; GCN-NEXT: .LBB0_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4
+; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16
+; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
+; GCN-NEXT: v_mov_b32_e32 v3, 0x8000
+; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -39,9 +95,59 @@ exit:
ret <2 x i16> %r2
}
-; GCN-LABEL: extract_2xi64
-; GCN-COUNT-2: v_cndmask_b32
define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xi64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB1_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB1_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, -1
+; GCN-NEXT: v_mov_b32_e32 v3, -1
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -60,9 +166,65 @@ exit:
ret <2 x i64> %r2
}
-; GCN-LABEL: extract_4xi64
-; GCN-COUNT-4: v_cndmask_b32
define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_4xi64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB2_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB2_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB2_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB2_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, -1
+; GCN-NEXT: v_mov_b32_e32 v3, -1
+; GCN-NEXT: v_mov_b32_e32 v5, -1
+; GCN-NEXT: v_mov_b32_e32 v7, -1
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -81,9 +243,92 @@ exit:
ret <4 x i64> %r2
}
-; GCN-LABEL: extract_8xi64
-; GCN-COUNT-8: v_cndmask_b32
define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_8xi64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB3_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB3_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB3_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB3_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
+; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, s[10:11]
+; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[12:13]
+; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, s[14:15]
+; GCN-NEXT: v_mov_b32_e32 v1, -1
+; GCN-NEXT: v_mov_b32_e32 v3, -1
+; GCN-NEXT: v_mov_b32_e32 v5, -1
+; GCN-NEXT: v_mov_b32_e32 v7, -1
+; GCN-NEXT: v_mov_b32_e32 v9, -1
+; GCN-NEXT: v_mov_b32_e32 v11, -1
+; GCN-NEXT: v_mov_b32_e32 v13, -1
+; GCN-NEXT: v_mov_b32_e32 v15, -1
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -102,9 +347,59 @@ exit:
ret <8 x i64> %r2
}
-; GCN-LABEL: extract_2xf64
-; GCN-COUNT-2: v_cndmask_b32
define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_2xf64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB4_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB4_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
+; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v0, -2.0, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -123,9 +418,65 @@ exit:
ret <2 x double> %r2
}
-; GCN-LABEL: extract_4xf64
-; GCN-COUNT-4: v_cndmask_b32
define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_4xf64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB5_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB5_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
+; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
+; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
@@ -144,9 +495,92 @@ exit:
ret <4 x double> %r2
}
-; GCN-LABEL: extract_8xf64
-; GCN-COUNT-8: v_cndmask_b32
define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
+; GCN-LABEL: extract_8xf64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB6_2
+; GCN-NEXT: ; %bb.1: ; %F
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB6_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_4
+; GCN-NEXT: ; %bb.3: ; %T
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s10
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: .LBB6_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
+; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
+; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
+; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
+; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
+; GCN-NEXT: v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
T:
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfa..29f9e3bf94d05 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -251,13 +251,13 @@
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Flatten the CFG
; GCN-O1-NEXT: Dominator Tree Construction
-; GCN-O1-NEXT: Cycle Info Analysis
-; GCN-O1-NEXT: Uniformity Analysis
-; GCN-O1-NEXT: AMDGPU IR late optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Code sinking
+; GCN-O1-NEXT: Cycle Info Analysis
+; GCN-O1-NEXT: Uniformity Analysis
+; GCN-O1-NEXT: AMDGPU IR late optimizations
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Unify divergent function exit nodes
; GCN-O1-NEXT: Dominator Tree Construction
@@ -546,13 +546,13 @@
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Flatten the CFG
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
-; GCN-O1-OPTS-NEXT: Uniformity Analysis
-; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Code sinking
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
+; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
@@ -853,13 +853,13 @@
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Flatten the CFG
; GCN-O2-NEXT: Dominator Tree Construction
-; GCN-O2-NEXT: Cycle Info Analysis
-; GCN-O2-NEXT: Uniformity Analysis
-; GCN-O2-NEXT: AMDGPU IR late optimizations
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Code sinking
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
+; GCN-O2-NEXT: AMDGPU IR late optimizations
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Unify divergent function exit nodes
; GCN-O2-NEXT: Dominator Tree Construction
@@ -1174,13 +1174,13 @@
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Flatten the CFG
; GCN-O3-NEXT: Dominator Tree Construction
-; GCN-O3-NEXT: Cycle Info Analysis
-; GCN-O3-NEXT: Uniformity Analysis
-; GCN-O3-NEXT: AMDGPU IR late optimizations
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Code sinking
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
+; GCN-O3-NEXT: AMDGPU IR late optimizations
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Unify divergent function exit nodes
; GCN-O3-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index c9dbadcbd2315..cacdc8237d5f3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2103,10 +2103,7 @@ define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA: ; %bb.0: ; %bb0
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT: v_mov_b32_e32 v0, 0xff
-; NOSDWA-NEXT: v_and_b32_e32 v0, s4, v0
-; NOSDWA-NEXT: v_lshlrev_b16_e64 v1, 8, 1
-; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, 0x100
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2126,9 +2123,7 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX89: ; %bb.0: ; %bb0
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b64 s[4:5], 0
-; GFX89-NEXT: v_lshlrev_b16_e64 v0, 8, 1
-; GFX89-NEXT: v_mov_b32_e32 v1, s4
-; GFX89-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x100
; GFX89-NEXT: s_and_b64 vcc, exec, -1
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2148,8 +2143,7 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX9: ; %bb.0: ; %bb0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1
-; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x100
; GFX9-NEXT: s_and_b64 vcc, exec, -1
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2168,18 +2162,16 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
; GFX10: ; %bb.0: ; %bb0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1
-; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
-; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX10-NEXT: .LBB22_1: ; %bb1
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_lshl_b32 s6, s4, 3
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_lshrrev_b16 v2, s6, 0x100
; GFX10-NEXT: s_mov_b64 s[4:5], 1
-; GFX10-NEXT: flat_store_byte v[1:2], v3
+; GFX10-NEXT: flat_store_byte v[0:1], v2
; GFX10-NEXT: s_cbranch_vccnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408d78255..0ac7858e29089 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
@@ -6,27 +6,31 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT: v_mov_b32_e32 v3, 8
; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v2, v5, s[4:5]
+; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
+; GFX906-NEXT: s_mov_b32 s4, 0xff0000
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v2, v5, s[6:7]
+; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:2
-; GFX906-NEXT: global_store_short v1, v0, s[2:3]
+; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2
+; GFX906-NEXT: global_store_short v1, v4, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -50,31 +54,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v2, v6, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT: global_load_dword v2, v3, s[4:5]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v2, v6, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT: global_load_dword v2, v3, s[6:7]
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5
-; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dword v1, v2, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -98,32 +90,23 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0
-; GFX906-NEXT: v_mov_b32_e32 v5, 0
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[4:5]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_byte v5, v2, s[2:3] offset:4
-; GFX906-NEXT: global_store_dword v5, v0, s[2:3]
+; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4
+; GFX906-NEXT: global_store_dword v3, v1, s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -147,42 +130,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB3_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7]
; GFX906-NEXT: .LBB3_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -206,64 +166,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0
; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB4_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7]
; GFX906-NEXT: .LBB4_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
-; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12
-; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9
-; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8
-; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6
-; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -286,114 +201,24 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
-; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
-; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
-; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
-; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
-; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
-; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7]
; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
-; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27
-; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24
-; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23
-; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v18
-; GFX906-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v17
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v15
-; GFX906-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v14
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v12
-; GFX906-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v11
-; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -415,1572 +240,595 @@ bb.2:
define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v256i8_liveout:
; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0
; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_mov_b32 s10, -1
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240
; GFX906-NEXT: s_mov_b32 s11, 0xe00000
; GFX906-NEXT: s_add_u32 s8, s8, s3
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0
; GFX906-NEXT: s_addc_u32 s9, s9, 0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v20, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[4:5] offset:160
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[4:5] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[4:5] offset:128
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[4:5] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[4:5] offset:96
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[4:5] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[4:5] offset:64
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[4:5] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[4:5] offset:32
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[4:5] offset:16
+; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[4:5] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[4:5] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[4:5] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[4:5] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[4:5] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[4:5] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[4:5] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[4:5] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[4:5] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[4:5] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[4:5] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[4:5] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[6:7] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[6:7] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[6:7] offset:192
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:32 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:36 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:40 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:44 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:48 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:52 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX906-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240
+; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:68 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:72 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:76 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:80 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:84 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:88 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:92 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:96 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:100 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:104 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:112 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:116 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:120 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:124 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:128 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:132 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:136 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:140 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:144 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:148 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:152 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:160 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:164 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:168 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:176 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:180 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:172 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:188 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:192 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:184 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:204 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:196 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[6:7] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[6:7] offset:160
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:208 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:212 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:224 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:216 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:220 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:236 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:228 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:232 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:248 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:240 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:244 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:256 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:260 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:272 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:264 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:268 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:284 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:276 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:280 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:296 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:288 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:292 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[6:7] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[6:7] offset:128
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:304 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:308 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:320 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:312 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:316 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:332 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:324 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:328 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:344 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:336 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:340 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:352 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:356 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:368 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:360 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:364 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:380 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:372 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:376 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:392 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:384 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:388 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[6:7] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[6:7] offset:96
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:400 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:404 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:416 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:408 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:412 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:428 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:420 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:424 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:440 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:432 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:436 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:448 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:452 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:464 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:456 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:460 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:476 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:468 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:472 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:488 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:480 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:484 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[6:7] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[6:7] offset:64
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:496 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:500 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:512 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:504 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:508 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:524 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:516 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:520 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:536 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:528 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:532 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:544 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:548 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:560 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:552 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:556 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:572 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:564 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:568 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:584 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:576 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:580 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[6:7] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[6:7] offset:32
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:592 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:596 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:608 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:600 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:604 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:620 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:612 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:616 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:632 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:624 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:628 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:640 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:644 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:656 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:648 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:652 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:668 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:660 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:664 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:680 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:672 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:676 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
-; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[6:7] offset:16
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:688 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:692 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:704 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:696 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:700 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:716 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:708 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:712 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:728 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:720 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:724 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:732 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(12)
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:740 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:748 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:756 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
-; GFX906-NEXT: buffer_store_dword v61, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
-; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
-; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[6:7] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[6:7] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[6:7] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[6:7] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[6:7] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[6:7] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[6:7] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[6:7] offset:64
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[6:7] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[6:7] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[6:7] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7]
; GFX906-NEXT: .LBB6_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:752 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:756 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:744 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
-; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:748 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:736 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:740 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:716 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:704 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:720 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:724 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v59, off, s[8:11], 0 offset:712 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:708 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:696 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58
-; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:700 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v57, off, s[8:11], 0 offset:688 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v58, off, s[8:11], 0 offset:692 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:668 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:656 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:672 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:676 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v55, off, s[8:11], 0 offset:664 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:660 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:648 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54
-; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:652 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v53, off, s[8:11], 0 offset:640 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v54, off, s[8:11], 0 offset:644 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:620 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:608 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:624 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:628 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v51, off, s[8:11], 0 offset:616 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:612 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:600 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50
-; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:604 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v49, off, s[8:11], 0 offset:592 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v50, off, s[8:11], 0 offset:596 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:572 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:560 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:576 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:580 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v47, off, s[8:11], 0 offset:568 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:564 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:552 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46
-; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:556 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v45, off, s[8:11], 0 offset:544 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v46, off, s[8:11], 0 offset:548 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:524 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:512 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:528 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:532 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v43, off, s[8:11], 0 offset:520 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:516 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:504 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42
-; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:508 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v41, off, s[8:11], 0 offset:496 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v42, off, s[8:11], 0 offset:500 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:476 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:464 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:480 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:484 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v39, off, s[8:11], 0 offset:472 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:468 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:456 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38
-; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:460 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v37, off, s[8:11], 0 offset:448 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v38, off, s[8:11], 0 offset:452 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:428 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:416 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:432 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:436 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:424 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:420 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:408 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34
-; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:412 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:400 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:404 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:380 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:368 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:384 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:388 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v31, off, s[8:11], 0 offset:376 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:372 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:360 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
-; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:364 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v29, off, s[8:11], 0 offset:352 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v30, off, s[8:11], 0 offset:356 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:332 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:320 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:336 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:340 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v27, off, s[8:11], 0 offset:328 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:324 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:312 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
-; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:316 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v25, off, s[8:11], 0 offset:304 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v26, off, s[8:11], 0 offset:308 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:284 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:272 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:288 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:292 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v23, off, s[8:11], 0 offset:280 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:276 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:264 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22
-; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:268 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v21, off, s[8:11], 0 offset:256 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v22, off, s[8:11], 0 offset:260 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:80
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:64
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:48
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:32
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:236 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:224 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:240 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:244 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v19, off, s[8:11], 0 offset:232 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:228 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:216 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18
-; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:220 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
+; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224
+; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:208
+; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:192
+; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:176
+; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:160
+; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:144
+; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:128
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: repeat_successor:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: s_cmp_lt_i32 s8, 3
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
+; GFX906-NEXT: ; %bb.1: ; %LeafBlock
+; GFX906-NEXT: s_cmp_gt_i32 s8, 0
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT: ; %bb.2:
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT: global_load_dword v0, v0, s[4:5]
+; GFX906-NEXT: s_branch .LBB7_5
+; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
+; GFX906-NEXT: s_cmp_eq_u32 s8, 3
+; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX906-NEXT: ; %bb.4: ; %sw.bb5
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX906-NEXT: global_load_dword v0, v0, s[6:7]
+; GFX906-NEXT: .LBB7_5: ; %return.sink.split
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v17, off, s[8:11], 0 offset:208 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v18, off, s[8:11], 0 offset:212 ; 4-byte Folded Reload
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT: .LBB7_6: ; %return
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+ switch i32 %in, label %return [
+ i32 1, label %return.sink.split
+ i32 2, label %return.sink.split
+ i32 3, label %sw.bb5
+ ]
+
+sw.bb5:
+ br label %return.sink.split
+
+return.sink.split:
+ %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+ store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+
+return:
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_chain:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB8_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT: .LBB8_2: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT: s_cbranch_execz .LBB8_4
+; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5]
+; GFX906-NEXT: .LBB8_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:192 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:184 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:188 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v14, off, s[8:11], 0 offset:164 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:196 ; 4-byte Folded Reload
-; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:180 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:176 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v13, off, s[8:11], 0 offset:160 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:172 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13
-; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+ store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+
+define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_zeroinit:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1]
+; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB9_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[2:3]
+; GFX906-NEXT: s_mov_b32 s2, 0
+; GFX906-NEXT: s_mov_b32 s3, s2
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_mov_b32_e32 v4, s3
+; GFX906-NEXT: v_mov_b32_e32 v3, s2
+; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT: .LBB9_2: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT: s_cbranch_execz .LBB9_4
+; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_mov_b32_e32 v2, v4
+; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT: .LBB9_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+ store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_phi_const:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
+; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:148 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:140 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:128 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB10_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX906-NEXT: v_mov_b32_e32 v1, 1
+; GFX906-NEXT: v_mov_b32_e32 v8, 2
+; GFX906-NEXT: v_mov_b32_e32 v6, 3
+; GFX906-NEXT: v_mov_b32_e32 v7, 4
+; GFX906-NEXT: v_mov_b32_e32 v2, 5
+; GFX906-NEXT: v_mov_b32_e32 v5, 6
+; GFX906-NEXT: v_mov_b32_e32 v3, 7
+; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX906-NEXT: .LBB10_2: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX906-NEXT: s_cbranch_execz .LBB10_4
+; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:144 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:136 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:132 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:124 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v10, off, s[8:11], 0 offset:116 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:120 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v9, off, s[8:11], 0 offset:112 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v9
-; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:100 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:92 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:80 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:96 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:88 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: global_store_dwordx2 v9, v[0:1], s[4:5]
+; GFX906-NEXT: .LBB10_4: ; %bb.3
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+ store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: v8i8_multi_block:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT: v_mov_b32_e32 v5, 0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:84 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:76 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:68 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:72 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
+; GFX906-NEXT: v_mov_b32_e32 v2, v4
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB11_4
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3]
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB11_3
+; GFX906-NEXT: ; %bb.2: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT: .LBB11_3: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: .LBB11_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
-; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:52 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:56 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: s_waitcnt vmcnt(3)
-; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:48 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:32 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:36 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(2)
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[6:7]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: v32i8_loop_carried:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
+; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0
+; GFX906-NEXT: s_mov_b32 s4, 0x2000604
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v1, v1, s[2:3]
+; GFX906-NEXT: s_mov_b64 s[2:3], 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_mov_b32_e32 v0, v1
+; GFX906-NEXT: .LBB12_1: ; %bb.1
+; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc
+; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_cbranch_execnz .LBB12_1
+; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX906-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ br label %bb.1
+
+bb.1:
+ %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+ br label %bb.2
+
+bb.2:
+ store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+; Should not have instances of "Instruction does not dominate all uses!"
+
+define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
+; GFX906-LABEL: v8i8_multiuse_multiblock:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
-; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT: v_lshrrev_b16_e32 v3, 8, v1
+; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX906-NEXT: s_cbranch_execz .LBB13_2
+; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v2
+; GFX906-NEXT: s_mov_b32 s6, 0x6070504
+; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
+; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v1
+; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v7, 0xffffff00, v2
+; GFX906-NEXT: v_perm_b32 v8, v1, v1, s6
+; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v7, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX906-NEXT: v_or_b32_e32 v7, v6, v7
+; GFX906-NEXT: v_or_b32_e32 v5, v6, v5
+; GFX906-NEXT: global_store_dword v4, v1, s[8:9]
+; GFX906-NEXT: global_store_dword v4, v8, s[8:9] offset:8
+; GFX906-NEXT: global_store_dword v4, v7, s[8:9] offset:16
+; GFX906-NEXT: global_store_dword v4, v5, s[8:9] offset:24
+; GFX906-NEXT: .LBB13_2: ; %Flow
+; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX906-NEXT: s_cbranch_execz .LBB13_4
+; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v2
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v6, 0xffffff00, v1
+; GFX906-NEXT: s_mov_b32 s2, 0xc0c0001
+; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX906-NEXT: v_or_b32_sdwa v6, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_perm_b32 v9, 0, v2, s2
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX906-NEXT: v_perm_b32 v7, 0, v1, s2
+; GFX906-NEXT: s_mov_b32 s3, 0xffff0000
+; GFX906-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX906-NEXT: v_and_or_b32 v8, v1, s3, v7
+; GFX906-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX906-NEXT: global_store_dword v0, v4, s[10:11]
+; GFX906-NEXT: global_store_dword v0, v5, s[10:11] offset:8
+; GFX906-NEXT: global_store_dword v0, v8, s[10:11] offset:16
+; GFX906-NEXT: global_store_dword v0, v6, s[10:11] offset:24
+; GFX906-NEXT: .LBB13_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX906-NEXT: v_lshlrev_b16_e32 v7, 8, v2
+; GFX906-NEXT: v_and_b32_e32 v4, 0xff, v1
+; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v2
+; GFX906-NEXT: v_or_b32_e32 v8, v3, v7
+; GFX906-NEXT: v_or_b32_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v1
+; GFX906-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX906-NEXT: v_or_b32_e32 v1, v8, v1
+; GFX906-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX906-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX906-NEXT: global_store_dword v0, v6, s[0:1]
+; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:8
+; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16
+; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
%gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
%cmp = icmp ult i32 %idx, 15
br i1 %cmp, label %bb.1, label %bb.2
bb.1:
- br label %bb.2
+ %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+ %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
+ %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
+ %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
+ %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
+ store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
+ store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
+ store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
+ store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
bb.2:
- %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+ %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
+ %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
+ %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
+ %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
+ store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
+ store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
+ store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
+ store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
+ br label %bb.3
+
+bb.3:
+ %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
+ %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
+ %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
+ %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
+ store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
+ store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
+ store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
+ store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
ret void
}
-declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
new file mode 100644
index 0000000000000..5d2e299aa854a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
+
+define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: br label [[BB_2]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
+; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: br label [[BB_2]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: br label [[BB_2]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: br label [[BB_2]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ br label %bb.2
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
+; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [
+; GFX906-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]]
+; GFX906-NEXT: i32 2, label [[RETURN_SINK_SPLIT]]
+; GFX906-NEXT: i32 3, label [[SW_BB5:%.*]]
+; GFX906-NEXT: ]
+; GFX906: sw.bb5:
+; GFX906-NEXT: br label [[RETURN_SINK_SPLIT]]
+; GFX906: return.sink.split:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
+; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+; GFX906: return:
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
+ switch i32 %in, label %return [
+ i32 1, label %return.sink.split
+ i32 2, label %return.sink.split
+ i32 3, label %sw.bb5
+ ]
+
+sw.bb5:
+ br label %return.sink.split
+
+return.sink.split:
+ %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
+ store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
+ ret void
+
+return:
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
+; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
+; GFX906-NEXT: br label [[BB_3]]
+; GFX906: bb.3:
+; GFX906-NEXT: [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
+; GFX906-NEXT: [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
+; GFX906-NEXT: store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
+ store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
+; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST0:%.*]], ptr addrspace(1) nocapture [[DST1:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
+; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
+; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
+; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
+; GFX906-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
+; GFX906-NEXT: br label [[BB_3]]
+; GFX906: bb.3:
+; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
+; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
+; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
+ %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
+ %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.3
+bb.1:
+ %cmp2 = icmp ult i32 %idx, 7
+ br i1 %cmp2, label %bb.2, label %bb.3
+
+bb.2:
+ store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
+ br label %bb.3
+
+bb.3:
+ %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
+ store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
+; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
+; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) nocapture [[DST:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
+; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
+; GFX906-NEXT: br label [[BB_1:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
+; GFX906-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
+; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
+; GFX906-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; GFX906-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
+; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
+; GFX906-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
+; GFX906: 0:
+; GFX906-NEXT: br label [[BB_2]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
+; GFX906-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
+; GFX906-NEXT: ret void
+;
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
+ %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+ br label %bb.1
+
+bb.1:
+ %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
+ %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %cmp = icmp ult i32 %idx, 15
+ br i1 %cmp, label %bb.1, label %bb.2
+ br label %bb.2
+
+bb.2:
+ store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
>From d90e99a593775dcfaed52a7ac8de2dbc8c877b3a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 1 Jul 2024 15:05:22 -0700
Subject: [PATCH 2/3] Separate the loop to resolve use after free
Change-Id: Id3cf508092d5c6321cc980ae168e79d525f558a3
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7623b73d6dd5f..f398524e42657 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -183,6 +183,13 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
for (auto &BB : F)
for (Instruction &I : make_early_inc_range(BB)) {
Changed |= visit(I);
+ }
+
+ // TODO -- combine the loops. visitLoad instruction deletes loads, which may
+ // cause use after free in optimizeLiveType. However, deferring the deletion
+ // of those may corrupt the logic in optimizeLiveType.
+ for (auto &BB : F)
+ for (Instruction &I : make_early_inc_range(BB)) {
Changed |= LRO.optimizeLiveType(&I);
}
>From 88748747b790e0f3388c6fb43a88876a47a25cb6 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 2 Jul 2024 09:13:48 -0700
Subject: [PATCH 3/3] Integrate the loops
Change-Id: I7a3968d4812c2af790648222f9fe78b5af356937
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 47 ++++-----
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 96 +++++++++++++------
2 files changed, 85 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index f398524e42657..4d01ef1e7f3cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -50,6 +50,8 @@ class AMDGPULateCodeGenPrepare
AssumptionCache *AC = nullptr;
UniformityInfo *UA = nullptr;
+ SmallVector<WeakTrackingVH, 8> DeadInsts;
+
public:
static char ID;
@@ -92,8 +94,6 @@ class LiveRegOptimizer {
Type *ConvertToScalar;
/// The set of visited Instructions
SmallPtrSet<Instruction *, 4> Visited;
- /// The set of Instructions to be deleted
- SmallPtrSet<Instruction *, 4> DeadInstrs;
/// Map of Value -> Converted Value
ValueToValueMap ValMap;
/// Map of containing conversions from Optimal Type -> Original Type per BB.
@@ -115,10 +115,8 @@ class LiveRegOptimizer {
/// Check for problematic PHI nodes or cross-bb values based on the value
/// defined by \p I, and coerce to legal types if necessary. For problematic
/// PHI node, we coerce all incoming values in a single invocation.
- bool optimizeLiveType(Instruction *I);
-
- /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
- void removeDeadInstrs();
+ bool optimizeLiveType(Instruction *I,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
// Whether or not the type should be replaced to avoid inefficient
// legalization code
@@ -163,6 +161,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+
if (ST.hasScalarSubwordLoads())
return false;
@@ -180,20 +179,13 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
bool Changed = false;
- for (auto &BB : F)
- for (Instruction &I : make_early_inc_range(BB)) {
+ for (auto &BB : reverse(F))
+ for (Instruction &I : make_early_inc_range(reverse(BB))) {
Changed |= visit(I);
+ Changed |= LRO.optimizeLiveType(&I, DeadInsts);
}
- // TODO -- combine the loops. visitLoad instruction deletes loads, which may
- // cause use after free in optimizeLiveType. However, deferring the deletion
- // of those may corrupt the logic in optimizeLiveType.
- for (auto &BB : F)
- for (Instruction &I : make_early_inc_range(BB)) {
- Changed |= LRO.optimizeLiveType(&I);
- }
-
- LRO.removeDeadInstrs();
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
return Changed;
}
@@ -283,7 +275,8 @@ Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
return Builder.CreateShuffleVector(Converted, ShuffleMask);
}
-bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
+bool LiveRegOptimizer::optimizeLiveType(
+ Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
SmallVector<Instruction *, 4> Worklist;
SmallPtrSet<PHINode *, 4> PhiNodes;
SmallPtrSet<Instruction *, 4> Defs;
@@ -373,7 +366,13 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
else
MissingIncVal = true;
}
- DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
+ Instruction *DeadInst = Phi;
+ if (MissingIncVal) {
+ DeadInst = cast<Instruction>(ValMap[Phi]);
+ // Do not use the dead phi
+ ValMap[Phi] = Phi;
+ }
+ DeadInsts.emplace_back(DeadInst);
}
// Coerce back to the original type and replace the uses.
for (Instruction *U : Uses) {
@@ -400,14 +399,6 @@ bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
return true;
}
-void LiveRegOptimizer::removeDeadInstrs() {
- // Remove instrs that have been marked dead after type-coercion.
- for (auto *I : DeadInstrs) {
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
- I->eraseFromParent();
- }
-}
-
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -479,7 +470,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
auto *NewVal = IRB.CreateBitCast(
IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
LI.replaceAllUsesWith(NewVal);
- RecursivelyDeleteTriviallyDeadInstructions(&LI);
+ DeadInsts.emplace_back(&LI);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 0ac7858e29089..8cdc60d30a816 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -523,51 +523,87 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-LABEL: v8i8_phi_const:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: ; implicit-def: $vgpr3
+; GFX906-NEXT: ; implicit-def: $vgpr13
+; GFX906-NEXT: ; implicit-def: $vgpr11
+; GFX906-NEXT: ; implicit-def: $vgpr14
+; GFX906-NEXT: ; implicit-def: $vgpr15
+; GFX906-NEXT: ; implicit-def: $vgpr12
+; GFX906-NEXT: ; implicit-def: $vgpr16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1
+; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX906-NEXT: s_cbranch_execz .LBB10_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[2:3]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX906-NEXT: s_and_b64 s[2:3], vcc, exec
; GFX906-NEXT: v_mov_b32_e32 v1, 1
-; GFX906-NEXT: v_mov_b32_e32 v8, 2
-; GFX906-NEXT: v_mov_b32_e32 v6, 3
-; GFX906-NEXT: v_mov_b32_e32 v7, 4
+; GFX906-NEXT: v_mov_b32_e32 v10, 2
+; GFX906-NEXT: v_mov_b32_e32 v9, 3
+; GFX906-NEXT: v_mov_b32_e32 v8, 4
; GFX906-NEXT: v_mov_b32_e32 v2, 5
-; GFX906-NEXT: v_mov_b32_e32 v5, 6
-; GFX906-NEXT: v_mov_b32_e32 v3, 7
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
-; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX906-NEXT: v_mov_b32_e32 v7, 6
+; GFX906-NEXT: v_mov_b32_e32 v6, 7
+; GFX906-NEXT: v_mov_b32_e32 v5, 8
+; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3
; GFX906-NEXT: .LBB10_2: ; %Flow
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB10_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v5
-; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
-; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_mov_b32_e32 v9, 0
-; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx2 v9, v[0:1], s[4:5]
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v8
+; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v7
+; GFX906-NEXT: v_lshlrev_b16_e32 v11, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[4:5]
+; GFX906-NEXT: v_mov_b32_e32 v3, v1
+; GFX906-NEXT: v_mov_b32_e32 v13, v10
+; GFX906-NEXT: v_mov_b32_e32 v11, v9
+; GFX906-NEXT: v_mov_b32_e32 v14, v8
+; GFX906-NEXT: v_mov_b32_e32 v4, v2
+; GFX906-NEXT: v_mov_b32_e32 v15, v7
+; GFX906-NEXT: v_mov_b32_e32 v12, v6
+; GFX906-NEXT: v_mov_b32_e32 v16, v5
; GFX906-NEXT: .LBB10_4: ; %bb.3
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v13
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16
+; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits mailing list