[llvm] [AMDGPU] Enable vectorization of i8 values. (PR #134934)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 8 14:45:57 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Gheorghe-Teodor Bercea (doru1004)
<details>
<summary>Changes</summary>
This patch enables the vectorization of i8 values for AMD GPUs.
---
Patch is 164.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134934.diff
11 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+7)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+22-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+9)
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+15-4)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll (+28-28)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/div.ll (+42-42)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/rem.ll (+42-42)
- (modified) llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll (+216-216)
- (added) llvm/test/CodeGen/AMDGPU/vectorize-i8-as-i32.ll (+88)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2efca0d1d754f..17aa46774194a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
/// pad to. Default is no padding.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
+ /// \return Returns true if vectorizing 4 x i8s into an i32 is possible.
+ /// Currently only used by the SLP vectorizer.
+ bool canVectorizei8s() const;
+
/// @}
/// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
virtual void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+ virtual bool canVectorizei8s() const = 0;
};
template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
Impl.collectKernelLaunchBounds(F, LB);
}
+
+ bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
};
template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3fe0a9101fdee..9ab97328370bd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxNumArgs() const { return UINT_MAX; }
+ bool canVectorizei8s() const { return false; }
+
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
return 0;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4fea4e5711f5a..ee0c3a9329a1f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
return TTIImpl->getMaxNumArgs();
}
+bool TargetTransformInfo::canVectorizei8s() const {
+ return TTIImpl->canVectorizei8s();
+}
+
bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..9c29402755e53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
- return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
- : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
- : 1;
+ return ElemWidth == 8 ? 4
+ : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+ : 1;
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+ VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+ unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
+ if (NumVectorElts > 1 &&
+ InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
+ return 0;
+ return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+ CostKind, VL);
+}
+
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+ if (NumVectorElts > 1 &&
+ VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+ return 0;
+
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
}
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..239becb9aab15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getVectorSplitCost() { return 0; }
+ InstructionCost getScalarizationOverhead(VectorType *InTy,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {});
+
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+ /// \return return true if we can pack 4 i8s into an i32.
+ bool canVectorizei8s() const;
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d411f2cb203a..efa1e77e4a2ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
LI0->getPointerAddressSpace(), CostKind);
} else {
- VecLdCost = TTI->getMemoryOpCost(
- Instruction::Load, VecTy, LI0->getAlign(),
- LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+ if (VecTy->getElementType() ==
+ IntegerType::getInt8Ty(VecTy->getContext()) &&
+ TTI->canVectorizei8s()) {
+ VecLdCost = 1;
+ } else {
+ VecLdCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo());
+ }
}
break;
case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
auto *VecTy = getWidenedType(ScalarTy, VF);
- if (TTI->getNumberOfParts(VecTy) == VF)
+ unsigned NumParts = TTI->getNumberOfParts(VecTy);
+ if (TTI->canVectorizei8s() &&
+ VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+ NumParts = 1;
+ if (NumParts == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned ActualVF = std::min(MaxInst - I, VF);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
index 2b3728b556d9e..2984c616e8f21 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll
@@ -47,13 +47,13 @@ define i32 @umax(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -72,13 +72,13 @@ define i32 @umax(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -153,13 +153,13 @@ define i32 @umin(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
@@ -178,13 +178,13 @@ define i32 @umin(i32 %arg) {
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
-; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/div.ll b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
index 459c41a0b3eb5..1d21b66166f62 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/div.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/div.ll
@@ -44,9 +44,9 @@ define i32 @sdiv() {
; SLOW-NEXT: Co...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/134934
More information about the llvm-commits
mailing list