[llvm] [VectorCombine] Scalarize binop-like intrinsics (PR #138095)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 00:47:59 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Luke Lau (lukel97)
Currently VectorCombine can scalarize vector compares and binary ops. This patch extends it to also scalarize binop-like intrinsics such as umax, minnum, etc., as sketched below.
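For illustration, here is roughly what the transform does for umax (taken from the `umax_fixed` test below; names like `%base` are illustrative):

```llvm
; before: both operands are scalars inserted into lane 0
%x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
%y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
%v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x.insert, <4 x i32> %y.insert)

; after: one scalar call, then insert the result into the folded base vector
%v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 %y)
%base = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
%v = insertelement <4 x i32> %base, i32 %v.scalar, i64 0
```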
The motivation behind this is to scalarize more intrinsics in VectorCombine rather than in DAGCombine, so we can sink splats across basic blocks: see #137786
This currently has very little effect on generated code because InstCombine doesn't yet canonicalize binary intrinsics where one operand is a constant into the form that VectorCombine expects, i.e. `binop (shuffle insert) const --> shuffle (binop insert const)` (sketched below). The plan is to land this patch first and then teach InstCombine that canonicalization in a subsequent patch, which avoids regressions in the meantime.
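To make that concrete, here is a hypothetical sketch of the canonical form for a splatted umax operand (the actual InstCombine patch may look different):

```llvm
; binop (shuffle insert) const
%ins = insertelement <4 x i32> poison, i32 %x, i32 0
%splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
%v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %splat, <4 x i32> splat (i32 42))

; --> shuffle (binop insert const): now VectorCombine sees an inserted scalar operand
%ins = insertelement <4 x i32> poison, i32 %x, i32 0
%m = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %ins, <4 x i32> splat (i32 42))
%v = shufflevector <4 x i32> %m, <4 x i32> poison, <4 x i32> zeroinitializer
```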
This uses `isTriviallyVectorizable` to determine whether an intrinsic is safe to scalarize. There's also `isTriviallyScalarizable`, but that seems more geared towards the Scalarizer pass and includes intrinsics with multiple return values.
It also only handles intrinsics with exactly two operands. In the future this could be generalized to handle an arbitrary number of operands, including unary operators and ternary intrinsics, e.g. fneg or fma; see the sketch below.
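For example, a future generalization might also catch patterns like these (hypothetical; not handled by this patch):

```llvm
; unary: fneg of an inserted scalar
%x.insert = insertelement <4 x float> poison, float %x, i32 0
%neg = fneg <4 x float> %x.insert

; ternary: fma of three inserted scalars
%a.insert = insertelement <4 x float> poison, float %a, i32 0
%b.insert = insertelement <4 x float> poison, float %b, i32 0
%c.insert = insertelement <4 x float> poison, float %c, i32 0
%v = call <4 x float> @llvm.fma.v4f32(<4 x float> %a.insert, <4 x float> %b.insert, <4 x float> %c.insert)
```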
---
Full diff: https://github.com/llvm/llvm-project/pull/138095.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+48-16)
- (added) llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll (+97)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 04c084ffdda97..7a7c533267f6f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,29 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
return true;
}
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I);
+ II && II->arg_size() == 2 &&
+ isTriviallyVectorizable(II->getIntrinsicID())) {
+ Ins0 = II->getArgOperand(0);
+ Ins1 = II->getArgOperand(1);
+ } else {
+ return false;
+ }
+ }
// Do not convert the vector condition of a vector select into a scalar
// condition. That may cause problems for codegen because of differences in
// boolean formats and register-file transfers.
// TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
+ if (isa<CmpInst>(I))
for (User *U : I.users())
if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
return false;
@@ -1085,15 +1094,24 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
+ if (isa<CmpInst>(I)) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
- } else {
+ } else if (isa<BinaryOperator>(I)) {
ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ IntrinsicCostAttributes ScalarICA(
+ II->getIntrinsicID(), ScalarTy,
+ SmallVector<Type *>(II->arg_size(), ScalarTy));
+ ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+ IntrinsicCostAttributes VectorICA(
+ II->getIntrinsicID(), VecTy,
+ SmallVector<Type *>(II->arg_size(), VecTy));
+ VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
}
// Get cost estimate for the insert element. This cost will factor into
@@ -1112,10 +1130,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
// inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
+ if (isa<CmpInst>(I))
++NumScalarCmp;
- else
+ else if (isa<BinaryOperator>(I))
++NumScalarBO;
+ else if (isa<IntrinsicInst>(I))
+ ++NumScalarIntrinsic;
// For constant cases, extract the scalar element, this should constant fold.
if (IsConst0)
@@ -1123,9 +1143,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (IsConst1)
V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ Value *Scalar;
+ if (isa<CmpInst>(I))
+ Scalar = Builder.CreateCmp(Pred, V0, V1);
+ else if (isa<BinaryOperator>(I))
+ Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1});
+ else
+ llvm_unreachable("Unexpected instruction type");
Scalar->setName(I.getName() + ".scalar");
@@ -1135,9 +1161,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);
// Fold the vector constants in the original vectors into a new base vector.
- Value *NewVecC =
- IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ Value *NewVecC;
+ if (isa<CmpInst>(I))
+ NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+ else if (isa<BinaryOperator>(I))
+ NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
+ else
+ llvm_unreachable("Unexpected instruction type");
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
new file mode 100644
index 0000000000000..5a25f5faf8911
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+ ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+ ret <4 x i32> %v
+}
``````````
https://github.com/llvm/llvm-project/pull/138095