[llvm] [LV] Teach the vectorizer to cost and vectorize llvm.sincos intrinsics (PR #123210)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 10:06:39 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/123210
From 3d770daeed86b79833025c2fa0cca624d840c9f0 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 16 Jan 2025 14:23:25 +0000
Subject: [PATCH 1/3] [LV] Teach the vectorizer to cost and vectorize
llvm.sincos intrinsics
This teaches the loop vectorizer that `llvm.sincos` is trivially
vectorizable. Additionally, this patch updates the cost model to
correctly cost intrinsics that return multiple values. Previously, the
cost model only considered intrinsics returning a `VectorType` as
needing scalarization, so intrinsics that return multiple vectors
(which also need scalarizing) were costed far too cheaply (at the cost
of a single function call).
The `llvm.sincos` intrinsic also has a custom cost when a vector
function library is available, as certain VFs can be expanded (later in
code-gen) to a vector function, reducing the cost to a single call
(plus possible loads of the results, since the vector function returns
its values via output pointers).
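For illustration, the widening this enables looks roughly as follows (a
hand-written sketch based on the AArch64 test added below; the value
names are illustrative and the chosen VF depends on the target and the
cost model):
  ; scalar loop body
  %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
  %sin = extractvalue { float, float } %call, 0
  %cos = extractvalue { float, float } %call, 1
  ; after vectorization with VF 4
  %wide.call = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %wide.load)
  %wide.sin = extractvalue { <4 x float>, <4 x float> } %wide.call, 0
  %wide.cos = extractvalue { <4 x float>, <4 x float> } %wide.call, 1
With a vector library such as ArmPL available, the widened intrinsic
can later be lowered to a single vector sincos call (returning its
results via output pointers), which is what the custom cost above
accounts for.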
---
.../llvm/Analysis/TargetTransformInfo.h | 5 +-
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 88 +++++++--
llvm/lib/Analysis/TargetTransformInfo.cpp | 13 +-
llvm/lib/Analysis/VectorUtils.cpp | 2 +
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +-
llvm/test/Analysis/CostModel/AMDGPU/frexp.ll | 56 +++---
.../LoopVectorize/AArch64/llvm.sincos.ll | 170 ++++++++++++++++++
.../Transforms/Scalarizer/deinterleave2.ll | 17 ++
llvm/test/Transforms/Scalarizer/sincos.ll | 17 --
10 files changed, 301 insertions(+), 73 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/llvm.sincos.ll
create mode 100644 llvm/test/Transforms/Scalarizer/deinterleave2.ll
delete mode 100644 llvm/test/Transforms/Scalarizer/sincos.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9048481b49189..00dbbc757f156 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -126,6 +126,7 @@ class IntrinsicCostAttributes {
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ TargetLibraryInfo const *LibInfo = nullptr;
public:
IntrinsicCostAttributes(
@@ -145,7 +146,8 @@ class IntrinsicCostAttributes {
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
const IntrinsicInst *I = nullptr,
- InstructionCost ScalarCost = InstructionCost::getInvalid());
+ InstructionCost ScalarCost = InstructionCost::getInvalid(),
+ TargetLibraryInfo const *LibInfo = nullptr);
Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
@@ -154,6 +156,7 @@ class IntrinsicCostAttributes {
InstructionCost getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
+ const TargetLibraryInfo *getLibInfo() const { return LibInfo; }
bool isTypeBasedOnly() const {
return Arguments.empty();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 032c7d7b5159e..d633b26811cf5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -1726,9 +1727,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
Type *RetTy = ICA.getReturnType();
- ElementCount RetVF =
- (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
- : ElementCount::getFixed(1));
+ ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
+ : ElementCount::getFixed(1);
+
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
@@ -1997,6 +1998,49 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
case Intrinsic::experimental_vector_match:
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+ case Intrinsic::sincos: {
+ // Vector variants of llvm.sincos can be mapped to a vector library call.
+ auto const *LibInfo = ICA.getLibInfo();
+ if (!LibInfo || !isVectorizedTy(RetTy))
+ break;
+
+ // Find associated libcall.
+ VectorType *VectorTy = cast<VectorType>(getContainedTypes(RetTy).front());
+ EVT VT = getTLI()->getValueType(DL, VectorTy);
+ RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getVectorElementType());
+ const char *LCName = getTLI()->getLibcallName(LC);
+ if (!LC || !LCName)
+ break;
+
+ // Search for a corresponding vector variant.
+ LLVMContext &Ctx = RetTy->getContext();
+ auto VF = getVectorizedTypeVF(RetTy);
+ VecDesc const *VD = nullptr;
+ for (bool Masked : {false, true}) {
+ if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+ break;
+ }
+ if (!VD)
+ break;
+
+ // Cost the call + mask.
+ auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(),
+ CostKind);
+ if (VD->isMasked())
+ Cost += thisT()->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
+ nullptr, {});
+
+ // Lowering to a sincos library call (with output pointers) may require us
+ // to emit reloads for the results.
+ Cost +=
+ thisT()->getMemoryOpCost(
+ Instruction::Load, VectorTy,
+ thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind) *
+ 2;
+ return Cost;
+ }
}
// Assume that we need to scalarize this intrinsic.
@@ -2005,10 +2049,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (RetVF.isVector() && !RetVF.isScalable()) {
ScalarizationCost = 0;
- if (!RetTy->isVoidTy())
- ScalarizationCost += getScalarizationOverhead(
- cast<VectorType>(RetTy),
- /*Insert*/ true, /*Extract*/ false, CostKind);
+ if (!RetTy->isVoidTy()) {
+ for (Type *VectorTy : getContainedTypes(RetTy)) {
+ ScalarizationCost += getScalarizationOverhead(
+ cast<VectorType>(VectorTy),
+ /*Insert*/ true, /*Extract*/ false, CostKind);
+ }
+ }
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
}
@@ -2689,27 +2736,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
- if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+ if (isVectorizedTy(RetTy)) {
+ ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
+
// Scalable vectors cannot be scalarized, so return Invalid.
- if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
- return isa<ScalableVectorType>(Ty);
- }))
+ if (any_of(concat<Type *const>(RetVTys, Tys),
+ [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
return InstructionCost::getInvalid();
- InstructionCost ScalarizationCost =
- SkipScalarizationCost
- ? ScalarizationCostPassed
- : getScalarizationOverhead(RetVTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ InstructionCost ScalarizationCost = ScalarizationCostPassed;
+ if (!SkipScalarizationCost) {
+ ScalarizationCost = 0;
+ for (Type *RetVTy : RetVTys) {
+ ScalarizationCost += getScalarizationOverhead(
+ cast<VectorType>(RetVTy), /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ }
+ }
- unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+ unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
SmallVector<Type *, 4> ScalarTys;
for (Type *Ty : Tys) {
if (Ty->isVectorTy())
Ty = Ty->getScalarType();
ScalarTys.push_back(Ty);
}
- IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+ IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
InstructionCost ScalarCost =
thisT()->getIntrinsicInstrCost(Attrs, CostKind);
for (Type *Ty : Tys) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1ca9a16b18112..ed041dc3c8bfb 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -101,13 +101,12 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Argument->getType());
}
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
- ArrayRef<const Value *> Args,
- ArrayRef<Type *> Tys,
- FastMathFlags Flags,
- const IntrinsicInst *I,
- InstructionCost ScalarCost)
- : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+IntrinsicCostAttributes::IntrinsicCostAttributes(
+ Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
+ InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
+ : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
+ LibInfo(LibInfo) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 53be7fc0bee9f..dcfd3d5a8bd6e 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -72,6 +72,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::atan2:
case Intrinsic::sin:
case Intrinsic::cos:
+ case Intrinsic::sincos:
case Intrinsic::tan:
case Intrinsic::sinh:
case Intrinsic::cosh:
@@ -185,6 +186,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::ucmp:
case Intrinsic::scmp:
return OpdIdx == -1 || OpdIdx == 0;
+ case Intrinsic::sincos:
case Intrinsic::is_fpclass:
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8a5db28ea0a4..9638c5f68e24b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2926,7 +2926,8 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
[&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast<IntrinsicInst>(CI));
+ dyn_cast<IntrinsicInst>(CI),
+ InstructionCost::getInvalid(), TLI);
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..06ff8aeb8922d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1179,7 +1179,8 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
+ dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
index 22134d042fabb..f5f4445b34b02 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
@@ -68,46 +68,46 @@ define void @frexp_f16_i32() {
define void @frexp_f16_i16() {
; GFX7-LABEL: 'frexp_f16_i16'
; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX8PLUS-LABEL: 'frexp_f16_i16'
; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX7-SIZE-LABEL: 'frexp_f16_i16'
; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16'
; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
-; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/llvm.sincos.ll b/llvm/test/Transforms/LoopVectorize/AArch64/llvm.sincos.ll
new file mode 100644
index 0000000000000..9c2b3a2f57f9c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/llvm.sincos.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|extractvalue|store)" --version 5
+; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK
+; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
+; RUN: cat %t.1 | FileCheck --check-prefix=CHECK-COST %s
+; RUN: cat %t.2 | FileCheck --check-prefix=CHECK-COST-ARMPL %s
+; REQUIRES: asserts
+
+; CHECK-COST-LABEL: sincos_f32
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of 58 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+; CHECK-COST-ARMPL-LABEL: sincos_f32
+; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+; CHECK-COST-ARMPL: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 12 for VF 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 4: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f32(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK: [[ENTRY:.*:]]
+; CHECK: [[VECTOR_PH:.*:]]
+; CHECK: [[VECTOR_BODY:.*:]]
+; CHECK: [[TMP3:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 0
+; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
+; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4
+; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4
+; CHECK: [[MIDDLE_BLOCK:.*:]]
+; CHECK: [[SCALAR_PH:.*:]]
+; CHECK: [[FOR_BODY:.*:]]
+; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK: [[EXIT:.*:]]
+;
+; CHECK-ARMPL-LABEL: define void @sincos_f32(
+; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-ARMPL: [[ENTRY:.*:]]
+; CHECK-ARMPL: [[VECTOR_PH:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
+; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
+; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
+; CHECK-ARMPL: [[TMP15:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 0
+; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 1
+; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP13]], 1
+; CHECK-ARMPL: store <vscale x 4 x float> [[TMP14]], ptr [[TMP19:%.*]], align 4
+; CHECK-ARMPL: store <vscale x 4 x float> [[TMP15]], ptr [[TMP22:%.*]], align 4
+; CHECK-ARMPL: store <vscale x 4 x float> [[TMP16]], ptr [[TMP24:%.*]], align 4
+; CHECK-ARMPL: store <vscale x 4 x float> [[TMP17]], ptr [[TMP27:%.*]], align 4
+; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
+; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
+; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
+; CHECK-ARMPL: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4
+; CHECK-ARMPL: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4
+; CHECK-ARMPL: [[EXIT:.*:]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @llvm.sincos.f32(float %in_val)
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; CHECK-COST-LABEL: sincos_f64
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+; CHECK-COST: Cost of 26 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST: Cost of Invalid for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+; CHECK-COST-ARMPL-LABEL: sincos_f64
+; CHECK-COST-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+; CHECK-COST-ARMPL: Cost of 12 for VF 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of Invalid for VF vscale x 1: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+; CHECK-COST-ARMPL: Cost of 13 for VF vscale x 2: WIDEN-INTRINSIC ir<%call> = call llvm.sincos(ir<%in_val>)
+
+define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @sincos_f64(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK: [[ENTRY:.*:]]
+; CHECK: [[VECTOR_PH:.*:]]
+; CHECK: [[VECTOR_BODY:.*:]]
+; CHECK: [[TMP3:%.*]] = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 0
+; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1
+; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8
+; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8
+; CHECK: [[MIDDLE_BLOCK:.*:]]
+; CHECK: [[SCALAR_PH:.*:]]
+; CHECK: [[FOR_BODY:.*:]]
+; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]])
+; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0
+; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1
+; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8
+; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8
+; CHECK: [[EXIT:.*:]]
+;
+; CHECK-ARMPL-LABEL: define void @sincos_f64(
+; CHECK-ARMPL-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] {
+; CHECK-ARMPL: [[ENTRY:.*:]]
+; CHECK-ARMPL: [[VECTOR_PH:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]])
+; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD1:%.*]])
+; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 0
+; CHECK-ARMPL: [[TMP15:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 0
+; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP12]], 1
+; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP13]], 1
+; CHECK-ARMPL: store <vscale x 2 x double> [[TMP14]], ptr [[TMP19:%.*]], align 8
+; CHECK-ARMPL: store <vscale x 2 x double> [[TMP15]], ptr [[TMP22:%.*]], align 8
+; CHECK-ARMPL: store <vscale x 2 x double> [[TMP16]], ptr [[TMP24:%.*]], align 8
+; CHECK-ARMPL: store <vscale x 2 x double> [[TMP17]], ptr [[TMP27:%.*]], align 8
+; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]])
+; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0
+; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1
+; CHECK-ARMPL: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8
+; CHECK-ARMPL: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8
+; CHECK-ARMPL: [[EXIT:.*:]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+ %in_val = load double, ptr %arrayidx, align 8
+ %call = tail call { double, double } @llvm.sincos.f64(double %in_val)
+ %extract_a = extractvalue { double, double } %call, 0
+ %extract_b = extractvalue { double, double } %call, 1
+ %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+ store double %extract_a, ptr %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+ store double %extract_b, ptr %arrayidx4, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/Scalarizer/deinterleave2.ll b/llvm/test/Transforms/Scalarizer/deinterleave2.ll
new file mode 100644
index 0000000000000..bbc7c726857df
--- /dev/null
+++ b/llvm/test/Transforms/Scalarizer/deinterleave2.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -passes="function(scalarizer)" -S | FileCheck %s
+
+; Test to make sure that struct return intrinsics that are not `isTriviallyScalarizable` do not get scalarized.
+
+define <2 x float> @test_(<4 x float> %Val) {
+; CHECK-LABEL: define <2 x float> @test_(
+; CHECK-SAME: <4 x float> [[VAL:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = call { <2 x float>, <2 x float> } @llvm.vector.deinterleave2.v4f32(<4 x float> [[VAL]])
+; CHECK-NEXT: [[EL:%.*]] = extractvalue { <2 x float>, <2 x float> } [[R]], 0
+; CHECK-NEXT: ret <2 x float> [[EL]]
+;
+ %r = call { <2 x float>, <2 x float> } @llvm.vector.deinterleave2.v4f32(<4 x float> %Val)
+ %el = extractvalue { <2 x float>, <2 x float> } %r, 0
+ ret <2 x float> %el
+}
+
diff --git a/llvm/test/Transforms/Scalarizer/sincos.ll b/llvm/test/Transforms/Scalarizer/sincos.ll
deleted file mode 100644
index 8db4ba3183290..0000000000000
--- a/llvm/test/Transforms/Scalarizer/sincos.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt %s -passes="function(scalarizer)" -S | FileCheck %s
-
-; Test to make sure that struct return intrinsics that are not `isTriviallyScalarizable` do not get scalarized.
-
-define <4 x float> @test_(<4 x float> %Val) {
-; CHECK-LABEL: define <4 x float> @test_(
-; CHECK-SAME: <4 x float> [[VAL:%.*]]) {
-; CHECK-NEXT: [[R:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[VAL]])
-; CHECK-NEXT: [[EL:%.*]] = extractvalue { <4 x float>, <4 x float> } [[R]], 0
-; CHECK-NEXT: ret <4 x float> [[EL]]
-;
- %r = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %Val)
- %el = extractvalue { <4 x float>, <4 x float> } %r, 0
- ret <4 x float> %el
-}
-
From 348fb3ad199cb62aeaadc4f5ade020362585cf45 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 17 Feb 2025 17:12:36 +0000
Subject: [PATCH 2/3] Factor out (private) helper
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 107 ++++++++++++++---------
1 file changed, 66 insertions(+), 41 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d633b26811cf5..8e4111fb61dff 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -286,6 +286,64 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return false;
}
+ /// Several struct-returning intrinsics (including llvm.sincos[pi] and
+ /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
+ /// vector library functions correspond to the scalar calls (e.g. sincos or
+ /// modf), which, unlike the intrinsics, return values via output pointers.
+ /// This helper checks if a vector call exists for the given intrinsic, and
+ /// returns the cost, which includes the cost of the mask (if required) and
+ /// the loads for values returned via output pointers. \p LC is the scalar
+ /// libcall and \p CallRetElementIndex (optional) is the struct element which
+ /// is mapped to the call return value. If std::nullopt is returned, then no
+ /// vector library call is available, so the intrinsic should be assigned the
+ /// default cost (e.g. scalarization).
+ std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
+ const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
+ RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) {
+ Type *RetTy = ICA.getReturnType();
+ // Vector variants of the intrinsic can be mapped to a vector library call.
+ auto const *LibInfo = ICA.getLibInfo();
+ if (!LibInfo || !isa<StructType>(RetTy) ||
+ !isVectorizedStructTy(cast<StructType>(RetTy)))
+ return std::nullopt;
+
+ // Find associated libcall.
+ const char *LCName = getTLI()->getLibcallName(LC);
+ if (!LC || !LCName)
+ return std::nullopt;
+
+ // Search for a corresponding vector variant.
+ LLVMContext &Ctx = RetTy->getContext();
+ ElementCount VF = getVectorizedTypeVF(RetTy);
+ VecDesc const *VD = nullptr;
+ for (bool Masked : {false, true}) {
+ if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+ break;
+ }
+ if (!VD)
+ return std::nullopt;
+
+ // Cost the call + mask.
+ auto Cost =
+ thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
+ if (VD->isMasked())
+ Cost += thisT()->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
+ nullptr, {});
+
+ // Lowering to a library call (with output pointers) may require us to emit
+ // reloads for the results.
+ for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
+ if (Idx == CallRetElementIndex)
+ continue;
+ Cost += thisT()->getMemoryOpCost(
+ Instruction::Load, VectorTy,
+ thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
+ }
+ return Cost;
+ }
+
protected:
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
@@ -1999,47 +2057,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::experimental_vector_match:
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
case Intrinsic::sincos: {
- // Vector variants of llvm.sincos can be mapped to a vector library call.
- auto const *LibInfo = ICA.getLibInfo();
- if (!LibInfo || !isVectorizedTy(RetTy))
- break;
-
- // Find associated libcall.
- VectorType *VectorTy = cast<VectorType>(getContainedTypes(RetTy).front());
- EVT VT = getTLI()->getValueType(DL, VectorTy);
- RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getVectorElementType());
- const char *LCName = getTLI()->getLibcallName(LC);
- if (!LC || !LCName)
- break;
-
- // Search for a corresponding vector variant.
- LLVMContext &Ctx = RetTy->getContext();
- auto VF = getVectorizedTypeVF(RetTy);
- VecDesc const *VD = nullptr;
- for (bool Masked : {false, true}) {
- if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
- break;
- }
- if (!VD)
- break;
-
- // Cost the call + mask.
- auto Cost = thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(),
- CostKind);
- if (VD->isMasked())
- Cost += thisT()->getShuffleCost(
- TargetTransformInfo::SK_Broadcast,
- VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
- nullptr, {});
-
- // Lowering to a sincos library call (with output pointers) may require us
- // to emit reloads for the results.
- Cost +=
- thisT()->getMemoryOpCost(
- Instruction::Load, VectorTy,
- thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind) *
- 2;
- return Cost;
+ Type *Ty = getContainedTypes(RetTy).front();
+ EVT VT = getTLI()->getValueType(DL, Ty);
+ RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getScalarType());
+ if (auto Cost =
+ getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC))
+ return *Cost;
+ // Otherwise, fallback to default scalarization cost.
+ break;
}
}
From 3a8249b446574f8127bde9f7a99a6949aa3b781a Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 19 Feb 2025 18:03:43 +0000
Subject: [PATCH 3/3] Add standalone cost model tests
---
.../llvm/Analysis/TargetTransformInfo.h | 2 +-
llvm/lib/Analysis/CostModel.cpp | 13 ++--
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +-
.../test/Analysis/CostModel/AArch64/sincos.ll | 67 +++++++++++++++++++
4 files changed, 78 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/Analysis/CostModel/AArch64/sincos.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 00dbbc757f156..9cd1031143151 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -132,7 +132,7 @@ class IntrinsicCostAttributes {
IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI,
InstructionCost ScalarCost = InstructionCost::getInvalid(),
- bool TypeBasedOnly = false);
+ bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);
IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index ee6622516a5ac..f476489722bf0 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CostModel.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -24,6 +25,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
@@ -48,6 +50,7 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
PreservedAnalyses CostModelPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
for (BasicBlock &B : F) {
for (Instruction &Inst : B) {
@@ -55,12 +58,12 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F,
// which cost kind to print.
InstructionCost Cost;
auto *II = dyn_cast<IntrinsicInst>(&Inst);
- if (II && TypeBasedIntrinsicCost) {
- IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
- InstructionCost::getInvalid(), true);
+ if (II) {
+ IntrinsicCostAttributes ICA(
+ II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
+ /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
- }
- else {
+ } else {
Cost = TTI.getInstructionCost(&Inst, CostKind);
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ed041dc3c8bfb..9b3ed46853626 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -69,9 +69,9 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
IntrinsicCostAttributes::IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
- bool TypeBasedOnly)
+ bool TypeBasedOnly, TargetLibraryInfo const *LibInfo)
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
- ScalarizationCost(ScalarizationCost) {
+ ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
new file mode 100644
index 0000000000000..8adcad904d883
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
+
+define void @sincos(
+; CHECK-LABEL: 'sincos'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half %x_f16)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float %x_f32)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double %x_f64)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 %x_f128)
+;
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> %x_v8xf16)
+; CHECK: Cost Model: Found an estimated cost of 58 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x_v4xf32)
+; CHECK: Cost Model: Found an estimated cost of 26 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x_v2xf64)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> %x_v1xf128)
+;
+; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> %x_nxv8xf16)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> %x_nxv4xf32)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> %x_nxv2xf64)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> %x_nxv1xf128)
+
+; CHECK-VECLIB-LABEL: 'sincos'
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half %x_f16)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float %x_f32)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double %x_f64)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 %x_f128)
+;
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 50 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> %x_v8xf16)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x_v4xf32)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x_v2xf64)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> %x_v1xf128)
+;
+; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> %x_nxv8xf16)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> %x_nxv4xf32)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> %x_nxv2xf64)
+; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> %x_nxv1xf128)
+
+ half %x_f16,
+ float %x_f32,
+ double %x_f64,
+ fp128 %x_f128,
+ <8 x half> %x_v8xf16,
+ <4 x float> %x_v4xf32,
+ <2 x double> %x_v2xf64,
+ <1 x fp128> %x_v1xf128,
+ <vscale x 8 x half> %x_nxv8xf16,
+ <vscale x 4 x float> %x_nxv4xf32,
+ <vscale x 2 x double> %x_nxv2xf64,
+ <vscale x 1 x fp128> %x_nxv1xf128
+) {
+ %f16 = call { half, half } @llvm.sincos.f16(half %x_f16)
+ %f32 = call { float, float } @llvm.sincos.f32(float %x_f32)
+ %f64 = call { double, double } @llvm.sincos.f64(double %x_f64)
+ %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 %x_f128)
+
+ %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> %x_v8xf16)
+ %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x_v4xf32)
+ %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x_v2xf64)
+ %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> %x_v1xf128)
+
+ %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> %x_nxv8xf16)
+ %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> %x_nxv4xf32)
+ %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> %x_nxv2xf64)
+ %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> %x_nxv1xf128)
+
+ ret void
+}