[llvm] Revert "[AMDGPU] Add IR LiveReg type-based optimization" (PR #97138)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 23:18:27 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Vitaly Buka (vitalybuka)
Changes:
Part of #66838.
Buildbot failures:
https://lab.llvm.org/buildbot/#/builders/52/builds/404
https://lab.llvm.org/buildbot/#/builders/55/builds/358
https://lab.llvm.org/buildbot/#/builders/164/builds/518
This reverts commit ded956440739ae326a99cbaef18ce4362e972679.
---
Patch is 279.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97138.diff
11 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (+2-293)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+2-2)
- (removed) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (-636)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll (+19-20)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+188-171)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector.ll (+32-466)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+16-8)
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+1753-601)
- (removed) llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll (-352)
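For context, the reverted LiveRegOptimizer rewrote vectors of illegal types that are live across basic blocks into i32-packed values, so they occupy fewer physical registers across the block boundary. A minimal hand-written sketch of the rewrite on a `<3 x i8>` PHI (value names and blocks are illustrative, not output of the pass):

```llvm
; Before: a <3 x i8> value crosses a block boundary through a PHI.
bb.2:
  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]

; After (sketch): each incoming def is widened to <4 x i8> and bitcast to a
; packed i32 next to its definition, the PHI is re-typed to i32, and the
; original type is reconstructed at the first non-PHI point of the use block.
entry:
  %vec1.ext = shufflevector <3 x i8> %vec1, <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %vec1.bc = bitcast <4 x i8> %vec1.ext to i32
  ; ... %vec2 is converted the same way in its own block ...
bb.2:
  %tmp5.tc = phi i32 [ %vec1.bc, %entry ], [ %vec2.bc, %bb.1 ]
  %tmp5.trunc = trunc i32 %tmp5.tc to i24
  %tmp5.orig = bitcast i24 %tmp5.trunc to <3 x i8>
```

The shufflevector/bitcast pair corresponds to convertToOptType in the diff below; the trunc/bitcast pair to convertFromOptType.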
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 7623b73d6dd5f..69fdeaebe0a01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -81,73 +81,6 @@ class AMDGPULateCodeGenPrepare
bool visitLoadInst(LoadInst &LI);
};
-using ValueToValueMap = DenseMap<const Value *, Value *>;
-
-class LiveRegOptimizer {
-private:
- Module *Mod = nullptr;
- const DataLayout *DL = nullptr;
- const GCNSubtarget *ST;
- /// The scalar type to convert to
- Type *ConvertToScalar;
- /// The set of visited Instructions
- SmallPtrSet<Instruction *, 4> Visited;
- /// The set of Instructions to be deleted
- SmallPtrSet<Instruction *, 4> DeadInstrs;
- /// Map of Value -> Converted Value
- ValueToValueMap ValMap;
- /// Per-BB map containing conversions from Optimal Type -> Original Type.
- DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
-
-public:
- /// Calculate and \p return the type to convert to given a problematic \p
- /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
- Type *calculateConvertType(Type *OriginalType);
- /// Convert the virtual register defined by \p V to the compatible vector of
- /// legal type
- Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
- /// Convert the virtual register defined by \p V back to the original type \p
- /// ConvertType, stripping away the MSBs in cases where there was an imperfect
- /// fit (e.g. v2i32 -> v7i8)
- Value *convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InstPt,
- BasicBlock *InsertBlock);
- /// Check for problematic PHI nodes or cross-bb values based on the value
- /// defined by \p I, and coerce to legal types if necessary. For a problematic
- /// PHI node, we coerce all incoming values in a single invocation.
- bool optimizeLiveType(Instruction *I);
-
- /// Remove all instructions that have become dead (i.e. all the re-typed PHIs)
- void removeDeadInstrs();
-
- // Whether or not the type should be replaced to avoid inefficient
- // legalization code
- bool shouldReplace(Type *ITy) {
- FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
- if (!VTy)
- return false;
-
- auto TLI = ST->getTargetLowering();
-
- Type *EltTy = VTy->getElementType();
- // If the element size is larger than the convert-to scalar size, then we
- // can't do any bit packing
- if (!EltTy->isIntegerTy() ||
- EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
- return false;
-
- // Only coerce illegal types
- TargetLoweringBase::LegalizeKind LK =
- TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
- return LK.first != TargetLoweringBase::TypeLegal;
- }
-
- LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
- DL = &Mod->getDataLayout();
- ConvertToScalar = Type::getInt32Ty(Mod->getContext());
- }
-};
-
} // end anonymous namespace
bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
@@ -169,238 +102,14 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- // "Optimize" the virtual regs that cross basic block boundaries. When
- // building the SelectionDAG, vectors of illegal types that cross basic blocks
- // will be scalarized and widened, with each scalar living in its
- // own register. To work around this, the optimization converts the
- // vectors to equivalent vectors of legal type (which are converted back
- // before uses in subsequent blocks), to pack the bits into fewer physical
- // registers (used in CopyToReg/CopyFromReg pairs).
- LiveRegOptimizer LRO(Mod, &ST);
-
bool Changed = false;
-
for (auto &BB : F)
- for (Instruction &I : make_early_inc_range(BB)) {
+ for (Instruction &I : llvm::make_early_inc_range(BB))
Changed |= visit(I);
- Changed |= LRO.optimizeLiveType(&I);
- }
- LRO.removeDeadInstrs();
return Changed;
}
-Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
- assert(OriginalType->getScalarSizeInBits() <=
- ConvertToScalar->getScalarSizeInBits());
-
- FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
- unsigned ConvertEltCount =
- (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
-
- if (OriginalSize <= ConvertScalarSize)
- return IntegerType::get(Mod->getContext(), ConvertScalarSize);
-
- return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
- ConvertEltCount, false);
-}
-
-Value *LiveRegOptimizer::convertToOptType(Instruction *V,
- BasicBlock::iterator &InsertPt) {
- FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
- Type *NewTy = calculateConvertType(V->getType());
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
- TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
-
- IRBuilder<> Builder(V->getParent(), InsertPt);
- // If there is a bitsize match, we can fit the old vector into a new vector of
- // desired type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, we must use a wider vector.
- assert(NewSize > OriginalSize);
- uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
-
- SmallVector<int, 8> ShuffleMask;
- uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
- for (unsigned I = 0; I < OriginalElementCount; I++)
- ShuffleMask.push_back(I);
-
- for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
- ShuffleMask.push_back(OriginalElementCount);
-
- Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
- return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
-}
-
-Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
- BasicBlock::iterator &InsertPt,
- BasicBlock *InsertBB) {
- FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
-
- TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
- TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
-
- IRBuilder<> Builder(InsertBB, InsertPt);
- // If there is a bitsize match, we simply convert back to the original type.
- if (OriginalSize == NewSize)
- return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
-
- // If there is a bitsize mismatch, then we must have used a wider value to
- // hold the bits.
- assert(OriginalSize > NewSize);
- // For wide scalars, we can just truncate the value.
- if (!V->getType()->isVectorTy()) {
- Instruction *Trunc = cast<Instruction>(
- Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
- return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
- }
-
- // For wider vectors, we must strip the MSBs to convert back to the original
- // type.
- VectorType *ExpandedVT = VectorType::get(
- Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
- (OriginalSize / NewVTy->getScalarSizeInBits()), false);
- Instruction *Converted =
- cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
-
- unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
- SmallVector<int, 8> ShuffleMask(NarrowElementCount);
- std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
-
- return Builder.CreateShuffleVector(Converted, ShuffleMask);
-}
-
-bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
- SmallVector<Instruction *, 4> Worklist;
- SmallPtrSet<PHINode *, 4> PhiNodes;
- SmallPtrSet<Instruction *, 4> Defs;
- SmallPtrSet<Instruction *, 4> Uses;
-
- Worklist.push_back(cast<Instruction>(I));
- while (!Worklist.empty()) {
- Instruction *II = Worklist.pop_back_val();
-
- if (!Visited.insert(II).second)
- continue;
-
- if (!shouldReplace(II->getType()))
- continue;
-
- if (PHINode *Phi = dyn_cast<PHINode>(II)) {
- PhiNodes.insert(Phi);
- // Collect all the incoming values of problematic PHI nodes.
- for (Value *V : Phi->incoming_values()) {
- // Repeat the collection process for newly found PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *IncInst = dyn_cast<Instruction>(V);
- // Other incoming value types (e.g. vector literals) are unhandled
- if (!IncInst && !isa<ConstantAggregateZero>(V))
- return false;
-
- // Collect all other incoming values for coercion.
- if (IncInst)
- Defs.insert(IncInst);
- }
- }
-
- // Collect all relevant uses.
- for (User *V : II->users()) {
- // Repeat the collection process for problematic PHI nodes.
- if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
- if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
- Worklist.push_back(OpPhi);
- continue;
- }
-
- Instruction *UseInst = cast<Instruction>(V);
- // Collect all uses of PHINodes and any use that crosses BB boundaries.
- if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
- Uses.insert(UseInst);
- if (!Defs.count(II) && !isa<PHINode>(II)) {
- Defs.insert(II);
- }
- }
- }
- }
-
- // Coerce and track the defs.
- for (Instruction *D : Defs) {
- if (!ValMap.contains(D)) {
- BasicBlock::iterator InsertPt = std::next(D->getIterator());
- Value *ConvertVal = convertToOptType(D, InsertPt);
- assert(ConvertVal);
- ValMap[D] = ConvertVal;
- }
- }
-
- // Construct new-typed PHI nodes.
- for (PHINode *Phi : PhiNodes) {
- ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
- Phi->getNumIncomingValues(),
- Phi->getName() + ".tc", Phi->getIterator());
- }
-
- // Connect all the PHI nodes with their new incoming values.
- for (PHINode *Phi : PhiNodes) {
- PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
- bool MissingIncVal = false;
- for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
- Value *IncVal = Phi->getIncomingValue(I);
- if (isa<ConstantAggregateZero>(IncVal)) {
- Type *NewType = calculateConvertType(Phi->getType());
- NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
- Phi->getIncomingBlock(I));
- } else if (ValMap.contains(IncVal))
- NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
- else
- MissingIncVal = true;
- }
- DeadInstrs.insert(MissingIncVal ? cast<Instruction>(ValMap[Phi]) : Phi);
- }
- // Coerce back to the original type and replace the uses.
- for (Instruction *U : Uses) {
- // Replace all converted operands for a use.
- for (auto [OpIdx, Op] : enumerate(U->operands())) {
- if (ValMap.contains(Op)) {
- Value *NewVal = nullptr;
- if (BBUseValMap.contains(U->getParent()) &&
- BBUseValMap[U->getParent()].contains(ValMap[Op]))
- NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
- else {
- BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
- NewVal =
- convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
- InsertPt, U->getParent());
- BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
- }
- assert(NewVal);
- U->setOperand(OpIdx, NewVal);
- }
- }
- }
-
- return true;
-}
-
-void LiveRegOptimizer::removeDeadInstrs() {
- // Remove instrs that have been marked dead after type-coercion.
- for (auto *I : DeadInstrs) {
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
- I->eraseFromParent();
- }
-}
-
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
unsigned AS = LI.getPointerAddressSpace();
// Skip non-constant address space.
@@ -410,7 +119,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
// Skip non-simple loads.
if (!LI.isSimple())
return false;
- Type *Ty = LI.getType();
+ auto *Ty = LI.getType();
// Skip aggregate types.
if (Ty->isAggregateType())
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f50a18ccc2188..9162e110aa10b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createSinkingPass());
+ addPass(createAMDGPULateCodeGenPreparePass());
if (TM->getOptLevel() > CodeGenOptLevel::None)
- addPass(createAMDGPULateCodeGenPreparePass());
+ addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
deleted file mode 100644
index 83cb92210ec84..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ /dev/null
@@ -1,636 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-
-define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v3i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v5, 16
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
-; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB0_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
-; GFX906-NEXT: .LBB0_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
-; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: global_store_short v1, v0, s[2:3]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] offset:2
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v4i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB1_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
-; GFX906-NEXT: .LBB1_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v5i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT: s_cbranch_execz .LBB2_2
-; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT: .LBB2_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT: global_store_byte v4, v1, s[2:3]
-; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1
-; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2
-; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3
-; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4
-; GFX906-NEXT: s_endpgm
-entry:
- %idx = call i32 @llvm.amdgcn.workitem.id.x()
- %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
- %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
- %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
- %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
- %cmp = icmp ult i32 %idx, 15
- br i1 %cmp, label %bb.1, label %bb.2
-bb.1:
- br label %bb.2
-
-bb.2:
- %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
- store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
- ret void
-}
-
-define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v8i8_liveout:
-; GFX906: ; %bb.0: ; %entry
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
...
[truncated]
``````````
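For reference, a worked sketch of the conversions the deleted calculateConvertType computed: the vector's bit width is ceil-divided by the 32-bit convert-to scalar, yielding either a single i32 or a vector of i32 (derived by hand from the code above, not taken from the patch):

```llvm
; <2 x i8>  (16 bits) -> i32         ; fits in one scalar: widen, then bitcast
; <3 x i8>  (24 bits) -> i32
; <4 x i8>  (32 bits) -> i32         ; exact size match, plain bitcast
; <7 x i8>  (56 bits) -> <2 x i32>   ; ceil(56 / 32) = 2 elements
```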
https://github.com/llvm/llvm-project/pull/97138