[llvm] [CostModel][AArch64] Make extractelement, with fmul user, free whenev… (PR #111479)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 02:52:01 PST 2024
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/111479
From 656f8cb3a68f1e47ea737068faa9904a01b389e4 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 8 Oct 2024 10:02:40 +0530
Subject: [PATCH] [CostModel][AArch64] Make extractelement, with fmul user,
free whenever possible
On Neon, if there is an extractelement from a lane != 0 such that
1. the extractelement does not necessitate a move from vector_reg -> GPR,
2. the extractelement result feeds into an fmul, and
3. the other operand of the fmul is a scalar, or an extractelement from lane 0 or a lane equivalent to 0,
then the extractelement can be merged with the fmul in the backend and incurs no cost.
e.g.
define double @foo(<2 x double> %a) {
%1 = extractelement <2 x double> %a, i32 0
%2 = extractelement <2 x double> %a, i32 1
%res = fmul double %1, %2
ret double %res
}
%2 and %res can be merged in the backend to generate:
fmul d0, d0, v0.d[1]
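Without the fusion, the backend would first have to move lane 1 into its own scalar register, roughly:
mov d1, v0.d[1]
fmul d0, d0, d1
so folding the lane access saves the extra mov.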
The change was tested with SPEC FP (C/C++) on Neoverse-v2.
Compile-time impact: none.
Performance impact: a 1.3-1.7% uplift on the lbm benchmark with -flto, depending on the configuration.
---
.../llvm/Analysis/TargetTransformInfo.h | 24 +++
.../llvm/Analysis/TargetTransformInfoImpl.h | 7 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 15 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 13 ++
.../AArch64/AArch64TargetTransformInfo.cpp | 143 +++++++++++++++++-
.../AArch64/AArch64TargetTransformInfo.h | 14 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 12 +-
.../CostModel/AArch64/extract_float.ll | 29 ++--
.../SLPVectorizer/consecutive-access.ll | 70 +++------
9 files changed, 251 insertions(+), 76 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0dc513d8e65b76..ce4de248b00635 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -22,6 +22,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PassManager.h"
@@ -1404,6 +1405,16 @@ class TargetTransformInfo {
unsigned Index = -1, Value *Op0 = nullptr,
Value *Op1 = nullptr) const;
+ /// \return The expected cost of vector Insert and Extract.
+ /// Use -1 to indicate that there is no information on the index value.
+ /// This is used when the instruction is not available; a typical use
+ /// case is to provision the cost of vectorization/scalarization in
+ /// vectorizer passes.
+ InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
+
/// \return The expected cost of vector Insert and Extract.
/// This is used when instruction is available, and implementation
/// asserts 'I' is not nullptr.
@@ -2100,6 +2111,12 @@ class TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) = 0;
+
+ virtual InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) = 0;
+
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) = 0;
@@ -2785,6 +2802,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
Value *Op1) override {
return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}
+ InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) override {
+ return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
+ ScalarUserAndIdx);
+ }
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) override {
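As an illustration only (not part of the patch): a caller of the new overload gathers every (Scalar, User, Lane) triple it knows about and passes the whole list alongside the one extract being costed. VecTy, CostKind, Lane, Scalar and ExternalScalars below are assumed names standing in for whatever the caller has in scope; the real caller added by this patch is in SLPVectorizer.cpp further down.
  // Sketch under assumed names: cost the extract of Scalar from lane Lane,
  // giving the target enough context to spot extract+fmul fusion chances.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (const auto &EU : ExternalScalars) // assumed per-caller record list
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  InstructionCost Cost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                             Lane, Scalar, ScalarUserAndIdx);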
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 317c13917c0cfc..727f13b3a3bd5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -699,6 +699,13 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
+ return 1;
+ }
+
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index b0316e67654dbc..10a55d888ab03c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -17,7 +17,6 @@
#define LLVM_CODEGEN_BASICTTIIMPL_H
#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -1282,12 +1281,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 1;
}
- InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Op0, Value *Op1) {
+ virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+ TTI::TargetCostKind CostKind,
+ unsigned Index, Value *Op0,
+ Value *Op1) {
return getRegUsageForType(Val->getScalarType());
}
+ InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
+ return getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, nullptr);
+ }
+
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c5c7b7c7c0a57f..439a39bef3a707 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1056,6 +1056,19 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
return Cost;
}
+InstructionCost TargetTransformInfo::getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
+ // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
+ // This is mentioned in the interface description and respected by all
+ // callers, but never asserted upon.
+ InstructionCost Cost = TTIImpl->getVectorInstrCost(
+ Opcode, Val, CostKind, Index, Scalar, ScalarUserAndIdx);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 71f9bbbbc35041..6dfa8c2a35935d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -11,6 +11,7 @@
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64SMEAttributes.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -3177,10 +3178,10 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
return 0;
}
-InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
- Type *Val,
- unsigned Index,
- bool HasRealUse) {
+InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
+ Type *Val, unsigned Index, bool HasRealUse, const Instruction *I,
+ std::optional<unsigned> Opcode, Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
@@ -3226,6 +3227,128 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
// compile-time considerations.
}
+ // On Neon, if there is an extractelement from a lane != 0 such that
+ // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
+ // 2. the extractelement result feeds into an fmul, and
+ // 3. the other operand of the fmul is a scalar, or an extractelement from
+ //    lane 0 or a lane equivalent to 0,
+ // then the extractelement can be merged with the fmul in the backend and
+ // incurs no cost.
+ // e.g.
+ // define double @foo(<2 x double> %a) {
+ // %1 = extractelement <2 x double> %a, i32 0
+ // %2 = extractelement <2 x double> %a, i32 1
+ // %res = fmul double %1, %2
+ // ret double %res
+ // }
+ // %2 and %res can be merged in the backend to generate fmul d0, d0, v0.d[1]
+ auto ExtractCanFuseWithFmul = [&]() {
+ // We bail out if the extract is from lane 0.
+ if (Index == 0)
+ return false;
+
+ // Check if the scalar element type of the vector operand of the
+ // extractelement instruction is one of the allowed types.
+ auto IsAllowedScalarTy = [&](const Type *T) {
+ return T->isFloatTy() || T->isDoubleTy() ||
+ (T->isHalfTy() && ST->hasFullFP16());
+ };
+
+ // Check if the extractelement user is scalar fmul.
+ auto IsUserFMulScalarTy = [](const Value *EEUser) {
+ // Check if the user is scalar fmul.
+ const auto *BO = dyn_cast_if_present<BinaryOperator>(EEUser);
+ return BO && BO->getOpcode() == BinaryOperator::FMul &&
+ !BO->getType()->isVectorTy();
+ };
+
+ // Check if the type constraints on the input vector type and the result
+ // scalar type of the extractelement instruction are satisfied.
+ auto TypeConstraintsOnEESatisfied =
+ [&IsAllowedScalarTy](const Type *VectorTy, const Type *ScalarTy) {
+ return isa<FixedVectorType>(VectorTy) && IsAllowedScalarTy(ScalarTy);
+ };
+
+ // Check if the extract index is from lane 0 or lane equivalent to 0 for a
+ // certain scalar type and a certain vector register width.
+ auto IsExtractLaneEquivalentToZero = [&](const unsigned &Idx,
+ const unsigned &EltSz) {
+ auto RegWidth =
+ getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedValue();
+ return (Idx == 0 || (Idx * EltSz) % RegWidth == 0);
+ };
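+ // For example, with 128-bit Neon registers, lane 2 of a <4 x double>
+ // starts a fresh 128-bit register (2 * 64 == 128, and 128 % 128 == 0),
+ // so for costing purposes it behaves like lane 0 of that register.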
+
+ if (Opcode.has_value()) {
+ if (!TypeConstraintsOnEESatisfied(Val, Val->getScalarType()))
+ return false;
+
+ DenseMap<User *, unsigned> UserToExtractIdx;
+ for (auto *U : Scalar->users()) {
+ if (!IsUserFMulScalarTy(U))
+ return false;
+ // Recording an entry for the user is what matters here; the index
+ // value is a placeholder that gets filled in below.
+ UserToExtractIdx[U];
+ }
+ for (auto &[S, U, L] : ScalarUserAndIdx) {
+ for (auto *U : S->users()) {
+ if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
+ auto *FMul = cast<BinaryOperator>(U);
+ auto *Op0 = FMul->getOperand(0);
+ auto *Op1 = FMul->getOperand(1);
+ if ((Op0 == S && Op1 == S) || (Op0 != S) || (Op1 != S)) {
+ UserToExtractIdx[U] = L;
+ break;
+ }
+ }
+ }
+ }
+ for (auto &[U, L] : UserToExtractIdx) {
+ if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
+ !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
+ return false;
+ }
+ } else {
+ const auto *EE = cast<ExtractElementInst>(I);
+
+ const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
+ if (!IdxOp)
+ return false;
+
+ if (!TypeConstraintsOnEESatisfied(EE->getVectorOperand()->getType(),
+ EE->getType()))
+ return false;
+
+ return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
+ if (!IsUserFMulScalarTy(U))
+ return false;
+
+ // Check if the other operand of the extractelement is also an
+ // extractelement from a lane equivalent to 0.
+ const auto *BO = cast<BinaryOperator>(U);
+ const auto *OtherEE = dyn_cast<ExtractElementInst>(
+ BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
+ if (OtherEE) {
+ const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
+ if (!IdxOp)
+ return false;
+ return IsExtractLaneEquivalentToZero(
+ cast<ConstantInt>(OtherEE->getIndexOperand())
+ ->getValue()
+ .getZExtValue(),
+ OtherEE->getType()->getScalarSizeInBits());
+ }
+ return true;
+ });
+ }
+ return true;
+ };
+
+ unsigned InstOpcode = I ? I->getOpcode() : Opcode.value();
+ if (InstOpcode == Instruction::ExtractElement && ExtractCanFuseWithFmul())
+ return 0;
+
// All other insert/extracts cost this much.
return ST->getVectorInsertExtractBaseCost();
}
@@ -3236,14 +3359,22 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
Value *Op1) {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
- return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
+ return getVectorInstrCostHelper(Val, Index, HasRealUse);
+}
+
+InstructionCost AArch64TTIImpl::getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
+ return getVectorInstrCostHelper(Val, Index, false, nullptr, Opcode, Scalar,
+ ScalarUserAndIdx);
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index) {
- return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
+ return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */, &I);
}
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 1d09d67f6ec9e3..f30f32203c73a1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -19,7 +19,6 @@
#include "AArch64.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
@@ -66,8 +65,11 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
// indicates whether the vector instruction is available in the input IR or
// just imaginary in vectorizer passes.
- InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
- unsigned Index, bool HasRealUse);
+ InstructionCost getVectorInstrCostHelper(
+ Type *Val, unsigned Index, bool HasRealUse,
+ const Instruction *I = nullptr,
+ std::optional<unsigned> Opcode = std::nullopt, Value *Scalar = nullptr,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {});
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
@@ -185,6 +187,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
+
+ InstructionCost getVectorInstrCost(
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ Value *Scalar,
+ ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx);
+
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..0f199e820aa953 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11977,6 +11977,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
+ // Keep track of each {Scalar, User, Index} tuple.
+ // On AArch64, this lets the backend fuse the mov associated with an
+ // extractelement into the fmul, making the extractelement free.
+ SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
+ for (ExternalUser &EU : ExternalUses) {
+ ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
+ }
for (ExternalUser &EU : ExternalUses) {
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -12089,8 +12096,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
- ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- CostKind, EU.Lane);
+ ExtraCost =
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
+ EU.Lane, EU.Scalar, ScalarUserAndIdx);
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
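The effect of this hook-up is visible in the consecutive-access.ll update below: a reduction tail of the shape (value names illustrative)
  %x = extractelement <2 x double> %sum, i32 0
  %y = extractelement <2 x double> %sum, i32 1
  %mul = fmul double %x, %y
now costs the lane-1 extract as free on AArch64, which is what allows SLP to vectorize @bar the same way it already did on x86.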
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
index dd3d0289bbb1cf..d2b75faa014d68 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
@@ -8,7 +8,7 @@
define double @extract_case1(<2 x double> %a) {
; CHECK-LABEL: 'extract_case1'
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x double> %a, i32 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res
entry:
@@ -57,7 +57,7 @@ entry:
; res = lane 1 * scalar
define double @extract_case5(<2 x double> %a, double %b) {
; CHECK-LABEL: 'extract_case5'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %b
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res
entry:
@@ -71,7 +71,7 @@ entry:
define double @extract_case6(<3 x double> %a) {
; CHECK-LABEL: 'extract_case6'
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <3 x double> %a, i32 0
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <3 x double> %a, i32 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <3 x double> %a, i32 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res
entry:
@@ -86,7 +86,7 @@ entry:
; register. But for other register sizes, this is not the case.
define double @extract_case7(<4 x double> %a) {
; CHECK-LABEL: 'extract_case7'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <4 x double> %a, i32 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <4 x double> %a, i32 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <4 x double> %a, i32 2
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res
@@ -158,11 +158,17 @@ entry:
; res = lane 0 * lane 1
define half @extract_case11(<2 x half> %a) {
-; CHECK-LABEL: 'extract_case11'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res
+; NOFP16-LABEL: 'extract_case11'
+; NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
+; NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1
+; NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
+; NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res
+;
+; FULLFP16-LABEL: 'extract_case11'
+; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
+; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x half> %a, i32 1
+; FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
+; FULLFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret half %res
entry:
%1 = extractelement <2 x half> %a, i32 0
%2 = extractelement <2 x half> %a, i32 1
@@ -174,7 +180,7 @@ entry:
define float @extract_case12(<2 x float> %a) {
; CHECK-LABEL: 'extract_case12'
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x float> %a, i32 0
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x float> %a, i32 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x float> %a, i32 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = fmul float %0, %1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res
entry:
@@ -200,6 +206,3 @@ entry:
}
declare void @foo(double)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; FULLFP16: {{.*}}
-; NOFP16: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
index 369ca28ece55b8..db24ccc2a5b346 100644
--- a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.9.0 | FileCheck %s --check-prefixes=CHECK-X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.9.0 | FileCheck %s --check-prefix=CHECK %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK %}
@A = common global [2000 x double] zeroinitializer, align 16
@B = common global [2000 x double] zeroinitializer, align 16
@@ -439,54 +439,28 @@ for.end: ; preds = %for.cond.for.end_cr
; Make sure we are able to vectorize this from now on:
;
define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
-; CHECK-X86-LABEL: @bar(
-; CHECK-X86-NEXT: entry:
-; CHECK-X86-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-X86-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
-; CHECK-X86: for.cond.cleanup:
-; CHECK-X86-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-X86-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
-; CHECK-X86-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-X86-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
-; CHECK-X86-NEXT: ret double [[MUL]]
-; CHECK-X86: for.body:
-; CHECK-X86-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-X86-NEXT: [[TMP3:%.*]] = phi <2 x double> [ [[TMP5]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-X86-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
-; CHECK-X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
-; CHECK-X86-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
-; CHECK-X86-NEXT: [[TMP5]] = fadd <2 x double> [[TMP3]], [[TMP4]]
-; CHECK-X86-NEXT: [[ADD5]] = add i32 [[I_018]], 2
-; CHECK-X86-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
-; CHECK-X86-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret double [[MUL]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x double> [ [[TMP5]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
;
-; CHECK-AARCH64-LABEL: @bar(
-; CHECK-AARCH64-NEXT: entry:
-; CHECK-AARCH64-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-AARCH64-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
-; CHECK-AARCH64: for.cond.cleanup:
-; CHECK-AARCH64-NEXT: [[X_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-AARCH64-NEXT: [[Y_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY]] ]
-; CHECK-AARCH64-NEXT: [[MUL:%.*]] = fmul double [[X_0_LCSSA]], [[Y_0_LCSSA]]
-; CHECK-AARCH64-NEXT: ret double [[MUL]]
-; CHECK-AARCH64: for.body:
-; CHECK-AARCH64-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT: [[Y_017:%.*]] = phi double [ [[ADD4]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT: [[X_016:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
-; CHECK-AARCH64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
-; CHECK-AARCH64-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; CHECK-AARCH64-NEXT: [[ADD]] = fadd double [[X_016]], [[TMP0]]
-; CHECK-AARCH64-NEXT: [[ADD1:%.*]] = or disjoint i32 [[I_018]], 1
-; CHECK-AARCH64-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64
-; CHECK-AARCH64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IDXPROM2]]
-; CHECK-AARCH64-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; CHECK-AARCH64-NEXT: [[ADD4]] = fadd double [[Y_017]], [[TMP1]]
-; CHECK-AARCH64-NEXT: [[ADD5]] = add i32 [[I_018]], 2
-; CHECK-AARCH64-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
-; CHECK-AARCH64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
-;
entry:
%cmp15 = icmp eq i32 %n, 0