[llvm] [SLP]Improve/fix extracts calculations for non-power-of-2 elements. (PR #93213)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 23 09:35:04 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
One of the previous patches introduced initial support for a non-power-of-2
number of elements, but some parts of the SLP vectorizer were still not
adjusted to handle the costs correctly. This patch fixes that by improving
the analysis of non-power-of-2 element counts and by correcting the cost
computation for extractelement instructions.
---
Patch is 59.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93213.diff
8 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+95-52)
- (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll (+5-18)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll (+60-84)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll (+13-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll (+13-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll (+14-60)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/hadd.ll (+14-60)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll (-4)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 08ecbe304429e..f044a8cdd2f31 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -255,6 +255,21 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+/// Returns power-of-2 number of elements in a single register (part), given the
+/// total number of elements \p Size and number of registers (parts) \p
+/// NumParts.
+static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
+ return PowerOf2Ceil(divideCeil(Size, NumParts));
+}
+
+/// Returns correct remaining number of elements, considering total amount \p
+/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
+/// and current register (part) \p Part.
+static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
+ unsigned Part) {
+ return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
+}
+
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
@@ -4081,7 +4096,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
const int VF = GetVF(I);
if (VF == 0)
continue;
- MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
// Shuffle of at least 2 vectors - ignore.
if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
std::fill(Slice.begin(), Slice.end(), NumScalars);
@@ -4091,7 +4107,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
// Try to include as much elements from the mask as possible.
int FirstMin = INT_MAX;
int SecondVecFound = false;
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem) {
Value *V = GatheredScalars[I * PartSz + K];
@@ -4116,7 +4132,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
ShuffledSubMasks.set(I);
continue;
}
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem)
continue;
@@ -4139,14 +4155,15 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
}
};
- int PartSz = NumScalars / NumParts;
+ int PartSz = getPartNumElems(NumScalars, NumParts);
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
if (!ExtractShuffles[I])
return 0U;
unsigned VF = 0;
- for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
+ for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
if (ExtractMask[K] == PoisonMaskElem)
continue;
@@ -4777,12 +4794,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
::addMask(ReorderMask, TE.ReuseShuffleIndices);
unsigned VF = ReorderMask.size();
OrdersType ResOrder(VF, VF);
- unsigned NumParts = VF / Sz;
+ unsigned NumParts = divideCeil(VF, Sz);
SmallBitVector UsedVals(NumParts);
for (unsigned I = 0; I < VF; I += Sz) {
int Val = PoisonMaskElem;
unsigned UndefCnt = 0;
- if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ unsigned Limit = std::min(Sz, VF - I);
+ if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
[&](int Idx) {
if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
Val = Idx;
@@ -8281,19 +8299,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs =
- TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
- if (NumSrcRegs == 0)
- NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
- unsigned EltsPerVector = PowerOf2Ceil(std::max(
- divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
auto CheckPerRegistersShuffle =
- [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ [&](MutableArrayRef<int> Mask,
+ SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
+ if (NumElts <= EltsPerVector)
+ return std::nullopt;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
+ Indices.assign(1, -1);
for (int &I : Mask) {
if (I == PoisonMaskElem)
continue;
@@ -8303,8 +8320,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
RegIndices.insert(RegId);
if (RegIndices.size() > 2)
return std::nullopt;
- if (RegIndices.size() == 2)
+ if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
+ if (Indices.size() == 1)
+ Indices.push_back(-1);
+ }
+ if (RegId == FirstRegId)
+ Indices.front() = I % NumElts;
+ else
+ Indices.back() = I % NumElts;
I = (I % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
@@ -8315,22 +8339,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ for (unsigned Part : seq<unsigned>(NumParts)) {
if (!ShuffleKinds[Part])
continue;
- ArrayRef<int> MaskSlice =
- Mask.slice(Part * EltsPerVector,
- (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
- ? Mask.size() % EltsPerVector
- : EltsPerVector);
+ ArrayRef<int> MaskSlice = Mask.slice(
+ Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
+ SmallVector<int> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
- CheckPerRegistersShuffle(SubMask);
+ CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
- FixedVectorType::get(ScalarTy, NumElts),
- MaskSlice);
+ if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(
+ MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -8339,6 +8364,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(ScalarTy, EltsPerVector),
SubMask);
}
+ for (int Idx : Indices) {
+ Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+ FixedVectorType::get(ScalarTy, NumElts),
+ std::nullopt, CostKind, Idx,
+ FixedVectorType::get(ScalarTy, EltsPerVector));
+ }
}
return Cost;
}
@@ -8366,11 +8397,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front().get<const TreeEntry *>() == &E1 &&
InVectors.back().get<const TreeEntry *>() == E2) ||
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
"Expected all poisoned elements.");
- ArrayRef<int> SubMask =
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
return;
}
@@ -8690,10 +8721,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
});
SmallPtrSet<Value *, 4> UniqueBases;
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -8790,7 +8822,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -8807,7 +8839,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -10664,12 +10696,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
- MutableArrayRef<Value *> SubVL =
- MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
+ Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVector<int> SubMask;
std::optional<TTI::ShuffleKind> Res =
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -11073,10 +11105,11 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
- unsigned SliceSize = VL.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ ArrayRef<Value *> SubVL =
+ VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
@@ -11679,11 +11712,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = E->Scalars.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
- MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
#ifndef NDEBUG
@@ -11720,7 +11754,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
[&](unsigned P) {
ArrayRef<int> SubMask =
- Mask.slice(P * SliceSize, SliceSize);
+ Mask.slice(P * SliceSize,
+ getNumElems(Mask.size(),
+ SliceSize, P));
return all_of(SubMask, [](int Idx) {
return Idx == PoisonMaskElem;
});
@@ -12104,13 +12140,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), 0);
+ std::iota(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ 0);
} else {
unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
+ std::fill(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ IVal);
}
return true;
};
@@ -12370,7 +12412,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
if (!GatherShuffles.empty()) {
- unsigned SliceSize = E->Scalars.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
for (const auto [I, TEs] : enumerate(Entries)) {
if (TEs.empty()) {
@@ -12380,7 +12422,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
VecMask.assign(VecMask.size(), PoisonMaskElem);
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index dfa8be9741779..aceee8840bb40 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -56,25 +56,12 @@ define half @reduction_half16(<16 x half> %vec16) {
;
; VI-LABEL: @reduction_half16(
; VI-NEXT: entry:
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]])
-; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT8]]
-; VI-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT9]], [[ELT10]]
-; VI-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT11]], [[ELT12]]
-; VI-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT13]], [[ELT14]]
-; VI-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; VI-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX2]], [[OP_RDX3]]
-; VI-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX4]], [[OP_RDX5]]
-; VI-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX6]], [[ELT15]]
-; VI-NEXT: ret half [[OP_RDX7]]
+; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP2]])
+; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; VI-NEXT: ret half [[OP_RDX]]
;
entry:
%elt0 = extractelement <16 x half> %vec16, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c38b519b..9608608a18098 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/93213
More information about the llvm-commits
mailing list