[llvm] 27bd8f9 - Recommit "[SLP] Fix lookahead operand reordering for splat loads." attempt 2, fixed assertion crash.
Vasileios Porpodas via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 22 17:09:19 PDT 2022
Author: Vasileios Porpodas
Date: 2022-03-22T16:41:55-07:00
New Revision: 27bd8f94928201f87f6b659fc2228efd539e8245
URL: https://github.com/llvm/llvm-project/commit/27bd8f94928201f87f6b659fc2228efd539e8245
DIFF: https://github.com/llvm/llvm-project/commit/27bd8f94928201f87f6b659fc2228efd539e8245.diff
LOG: Recommit "[SLP] Fix lookahead operand reordering for splat loads." attempt 2, fixed assertion crash.
Original review: https://reviews.llvm.org/D121354
This reverts commit f7d7d2a08d16356c57f6d2d36bc2fc0589a55df9.
Added:
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 23a3faa4d9687..31eb40e365b63 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -658,6 +658,10 @@ class TargetTransformInfo {
/// Return true if the target supports nontemporal load.
bool isLegalNTLoad(Type *DataType, Align Alignment) const;
+ /// \Returns true if the target supports broadcasting a load to a vector of
+ /// type <NumElements x ElementTy>.
+ bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const;
+
/// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
/// Return true if the target supports masked gather.
@@ -1044,11 +1048,14 @@ class TargetTransformInfo {
/// The exact mask may be passed as Mask, or else the array will be empty.
/// The index and subtype parameters are used by the subvector insertion and
/// extraction shuffle kinds to show the insert/extract point and the type of
- /// the subvector being inserted/extracted.
+ /// the subvector being inserted/extracted. The operands of the shuffle can be
+ /// passed through \p Args, which helps improve the cost estimation in some
+ /// cases, like in broadcast loads.
/// NOTE: For subvector extractions Tp represents the source type.
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask = None, int Index = 0,
- VectorType *SubTp = nullptr) const;
+ VectorType *SubTp = nullptr,
+ ArrayRef<Value *> Args = None) const;
/// Represents a hint about the context in which a cast is used.
///
@@ -1549,6 +1556,8 @@ class TargetTransformInfo::Concept {
virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
+ virtual bool isLegalBroadcastLoad(Type *ElementTy,
+ unsigned NumElements) const = 0;
virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
virtual bool forceScalarizeMaskedGather(VectorType *DataType,
@@ -1659,7 +1668,8 @@ class TargetTransformInfo::Concept {
ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0;
virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) = 0;
+ VectorType *SubTp,
+ ArrayRef<Value *> Args) = 0;
virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src, CastContextHint CCH,
TTI::TargetCostKind CostKind,
@@ -1952,6 +1962,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool isLegalNTLoad(Type *DataType, Align Alignment) override {
return Impl.isLegalNTLoad(DataType, Alignment);
}
+ bool isLegalBroadcastLoad(Type *ElementTy,
+ unsigned NumElements) const override {
+ return Impl.isLegalBroadcastLoad(ElementTy, NumElements);
+ }
bool isLegalMaskedScatter(Type *DataType, Align Alignment) override {
return Impl.isLegalMaskedScatter(DataType, Alignment);
}
@@ -2179,8 +2193,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
}
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) override {
- return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args) override {
+ return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args);
}
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
CastContextHint CCH,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 45990266fb30e..806014ee54fdc 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -256,6 +256,10 @@ class TargetTransformInfoImplBase {
return Alignment >= DataSize && isPowerOf2_32(DataSize);
}
+ bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const {
+ return false;
+ }
+
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
return false;
}
@@ -488,7 +492,8 @@ class TargetTransformInfoImplBase {
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) const {
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None) const {
return 1;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 8e62dfff117b5..d3b2272090d4d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -871,7 +871,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None) {
switch (improveShuffleKindFromMask(Kind, Mask)) {
case TTI::SK_Broadcast:
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ae2221433eca7..804331ecfa14d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -396,6 +396,11 @@ bool TargetTransformInfo::isLegalNTLoad(Type *DataType, Align Alignment) const {
return TTIImpl->isLegalNTLoad(DataType, Alignment);
}
+bool TargetTransformInfo::isLegalBroadcastLoad(Type *ElementTy,
+ unsigned NumElements) const {
+ return TTIImpl->isLegalBroadcastLoad(ElementTy, NumElements);
+}
+
bool TargetTransformInfo::isLegalMaskedGather(Type *DataType,
Align Alignment) const {
return TTIImpl->isLegalMaskedGather(DataType, Alignment);
@@ -740,12 +745,11 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost(
return Cost;
}
-InstructionCost TargetTransformInfo::getShuffleCost(ShuffleKind Kind,
- VectorType *Ty,
- ArrayRef<int> Mask,
- int Index,
- VectorType *SubTp) const {
- InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp);
+InstructionCost TargetTransformInfo::getShuffleCost(
+ ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask, int Index,
+ VectorType *SubTp, ArrayRef<Value *> Args) const {
+ InstructionCost Cost =
+ TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp, Args);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 2cd3c93167f14..b9f6f49dd04db 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2604,7 +2604,8 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6029b9f24456..92005b3ba40c9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -330,7 +330,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
/// @}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a8df7789c8a19..bdd22a4614f4f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1042,7 +1042,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
- int Index, VectorType *SubTp) {
+ int Index, VectorType *SubTp,
+ ArrayRef<Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e901b5c5747d6..4743042f5faea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -201,7 +201,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d9d563ead2605..7e802238b362e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1202,7 +1202,8 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) {
InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp, ArrayRef<int> Mask,
- int Index, VectorType *SubTp) {
+ int Index, VectorType *SubTp,
+ ArrayRef<Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasNEON()) {
if (Kind == TTI::SK_Broadcast) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 5bb84899e5ef0..3139c412aeb87 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -213,7 +213,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 1bdd8c3c513a1..280d8f19725b4 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -223,7 +223,8 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
ArrayRef<int> Mask, int Index,
- Type *SubTp) {
+ Type *SubTp,
+ ArrayRef<Value *> Args) {
return 1;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 9e637dfc3e161..65eb9d9fb5bbe 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -125,7 +125,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
- ArrayRef<int> Mask, int Index, Type *SubTp);
+ ArrayRef<int> Mask, int Index, Type *SubTp,
+ ArrayRef<Value *> Args = None);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index cc5738a5d7b63..1ee960fcdbc73 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1015,7 +1015,8 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
ArrayRef<int> Mask, int Index,
- Type *SubTp) {
+ Type *SubTp,
+ ArrayRef<Value *> Args) {
InstructionCost CostFactor =
vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 0af6f2a308d91..61cb689c352fa 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -111,7 +111,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
- ArrayRef<int> Mask, int Index, Type *SubTp);
+ ArrayRef<int> Mask, int Index, Type *SubTp,
+ ArrayRef<Value *> Args = None);
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 6721a0a258d3d..5f797f1dae7f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -175,7 +175,8 @@ InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) {
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp, ArrayRef<int> Mask,
- int Index, VectorType *SubTp) {
+ int Index, VectorType *SubTp,
+ ArrayRef<Value *> Args) {
if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
return getSpliceCost(Tp, Index);
return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 63c7ed0bbdbb7..9088d4847f3d9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -80,7 +80,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
InstructionCost getSpliceCost(VectorType *Tp, int Index);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6d66ebfced05e..5e0c4805c5091 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -559,7 +559,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<Value *> Args) {
Kind = improveShuffleKindFromMask(Kind, Mask);
if (ST->hasVector()) {
unsigned NumVectors = getNumVectorRegs(Tp);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index db4ec794b3e4a..acc9aee0c8fe5 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -92,7 +92,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const Instruction *CxtI = nullptr);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 32f9f56184d47..ed3a7e2d5c238 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1085,7 +1085,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp) {
+ VectorType *SubTp,
+ ArrayRef<Value *> Args) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
@@ -1545,9 +1546,27 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
- if (ST->hasSSE2())
+ static const CostTblEntry SSE3BroadcastLoadTbl[] = {
+ {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
+ };
+
+ if (ST->hasSSE2()) {
+ bool IsLoad = !Args.empty() && llvm::all_of(Args, [](const Value *V) {
+ return isa<LoadInst>(V);
+ });
+ if (ST->hasSSE3() && IsLoad)
+ if (const auto *Entry =
+ CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
+ assert(isLegalBroadcastLoad(
+ BaseTp->getElementType(),
+ cast<FixedVectorType>(BaseTp)->getNumElements()) &&
+ "Table entry missing from isLegalBroadcastLoad()");
+ return LT.first * Entry->Cost;
+ }
+
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
+ }
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
@@ -5118,6 +5137,13 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
return true;
}
+bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
+ unsigned NumElements) const {
+ // movddup
+ return ST->hasSSE3() && NumElements == 2 &&
+ ElementTy == Type::getDoubleTy(ElementTy->getContext());
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index a8909ee6457de..d262835dd44a4 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -131,7 +131,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const Instruction *CxtI = nullptr);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask, int Index,
- VectorType *SubTp);
+ VectorType *SubTp,
+ ArrayRef<Value *> Args = None);
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
@@ -226,6 +227,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
+ bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const;
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
return forceScalarizeMaskedGather(VTy, Alignment);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 926b76f9da8d2..c6f1e68cb89c0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1136,6 +1136,11 @@ class BoUpSLP {
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 4;
+ /// The same load multiple times. This should have a better score than
+ /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
+ /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
+ /// a vector load and 1.0 for a broadcast.
+ static const int ScoreSplatLoads = 3;
/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreReversedLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
@@ -1162,9 +1167,18 @@ class BoUpSLP {
/// MainAltOps.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
ScalarEvolution &SE, int NumLanes,
- ArrayRef<Value *> MainAltOps) {
- if (V1 == V2)
+ ArrayRef<Value *> MainAltOps,
+ const TargetTransformInfo *TTI) {
+ if (V1 == V2) {
+ if (isa<LoadInst>(V1)) {
+ // A broadcast of a load can be cheaper on some targets.
+ // TODO: For now accept a broadcast load with no other internal uses.
+ if (TTI->isLegalBroadcastLoad(V1->getType(), NumLanes) &&
+ (int)V1->getNumUses() == NumLanes)
+ return VLOperands::ScoreSplatLoads;
+ }
return VLOperands::ScoreSplat;
+ }
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
@@ -1343,7 +1357,7 @@ class BoUpSLP {
// Get the shallow score of V1 and V2.
int ShallowScoreAtThisLevel =
- getShallowScore(LHS, RHS, DL, SE, getNumLanes(), MainAltOps);
+ getShallowScore(LHS, RHS, DL, SE, getNumLanes(), MainAltOps, R.TTI);
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
@@ -5237,7 +5251,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// broadcast.
assert(VecTy == FinalVecTy &&
"No reused scalars expected for broadcast.");
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/None, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/VL);
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index 6c456bb77da6e..0054520cff567 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -643,32 +643,62 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; Same as @ChecksExtractScores, but the extratelement vector operands do not match.
define void @ChecksExtractScores_
diff erent_vectors(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2, <2 x double>* %vecPtr3, <2 x double>* %vecPtr4) {
-; CHECK-LABEL: @ChecksExtractScores_
diff erent_vectors(
-; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
-; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
-; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
-; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
-; CHECK-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
-; CHECK-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
-; CHECK-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
-; CHECK-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
-; CHECK-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
-; CHECK-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]]
-; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
-; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
-; CHECK-NEXT: ret void
+; SSE-LABEL: @ChecksExtractScores_
diff erent_vectors(
+; SSE-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
+; SSE-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
+; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
+; SSE-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
+; SSE-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
+; SSE-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
+; SSE-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
+; SSE-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
+; SSE-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1
+; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
+; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
+; SSE-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]]
+; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]]
+; SSE-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
+; SSE-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
+; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
+; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ChecksExtractScores_
diff erent_vectors(
+; AVX-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
+; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
+; AVX-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4
+; AVX-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4
+; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
+; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
+; AVX-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
+; AVX-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
+; AVX-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
+; AVX-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
+; AVX-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
+; AVX-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[EXTRA1]], i32 1
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA0]], i32 1
+; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
+; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
+; AVX-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[LOADA1]], i32 1
+; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]]
+; AVX-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]]
+; AVX-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
+; AVX-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
+; AVX-NEXT: [[TMP12:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
+; AVX-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
+; AVX-NEXT: ret void
;
%idx0 = getelementptr inbounds double, double* %array, i64 0
%idx1 = getelementptr inbounds double, double* %array, i64 1
@@ -701,28 +731,50 @@ define void @ChecksExtractScores_
diff erent_vectors(double* %storeArray, double*
; This checks that we we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads in x86 use a single instruction so they are quite cheap.
define double @splat_loads(double *%array1, double *%array2, double *%ptrA, double *%ptrB) {
-; CHECK-LABEL: @splat_loads(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0
-; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0
-; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
-; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]]
-; CHECK-NEXT: ret double [[ADD3]]
+; SSE-LABEL: @splat_loads(
+; SSE-NEXT: entry:
+; SSE-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0
+; SSE-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1
+; SSE-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>*
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; SSE-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0
+; SSE-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1
+; SSE-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>*
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]]
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
+; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]]
+; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]]
+; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]]
+; SSE-NEXT: ret double [[ADD3]]
+;
+; AVX-LABEL: @splat_loads(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0
+; AVX-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1
+; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>*
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; AVX-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0
+; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1
+; AVX-NEXT: [[LD_2_0:%.*]] = load double, double* [[GEP_2_0]], align 8
+; AVX-NEXT: [[LD_2_1:%.*]] = load double, double* [[GEP_2_1]], align 8
+; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1
+; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1
+; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]]
+; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]]
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
+; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; AVX-NEXT: ret double [[ADD3]]
;
entry:
%gep_1_0 = getelementptr inbounds double, double* %array1, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 51d564f0bfacc..487751877f880 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -253,13 +253,16 @@ define void @vecload_vs_broadcast4(double * noalias %from, double * noalias %to,
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
@@ -306,13 +309,16 @@ define void @shuffle_nodes_match2(double * noalias %from, double * noalias %to,
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
More information about the llvm-commits
mailing list