[llvm] d827588 - [VectorCombine] Scalarize binop-like intrinsics (#138095)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 01:24:15 PDT 2025
Author: Luke Lau
Date: 2025-05-21T09:24:11+01:00
New Revision: d827588c36131c342fe3602a2876b40efc39638a
URL: https://github.com/llvm/llvm-project/commit/d827588c36131c342fe3602a2876b40efc39638a
DIFF: https://github.com/llvm/llvm-project/commit/d827588c36131c342fe3602a2876b40efc39638a.diff
LOG: [VectorCombine] Scalarize binop-like intrinsics (#138095)
Currently VectorCombine can scalarize vector compares and binary ops.
This extends it to also scalarize binop-like intrinsics such as umax,
minnum, etc.
The motivation behind this is to scalarize more intrinsics in
VectorCombine rather than in DAGCombine, so we can sink splats across
basic blocks; see #137786.
This currently has very little effect on generated code because
InstCombine doesn't yet canonicalize binary intrinsics where one operand
is a constant into the form that VectorCombine expects, i.e. `binop
(shuffle insert) const --> shuffle (binop insert const)`. The plan is to
land this first and then teach InstCombine to do the canonicalization in
a subsequent patch, to avoid regressions in the meantime.
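To illustrate, here is the shape of the transform on the umax case from
the new intrinsic-scalarize.ll test below (value names are illustrative;
the trailing vector call on the poison base vectors is the folded base
vector, NewVecC in the code, and is expected to simplify away):

  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)

becomes

  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 %y)
  %newvecc = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
  %v = insertelement <4 x i32> %newvecc, i32 %v.scalar, i64 0
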
This uses `isTriviallyVectorizable` to determine whether or not an
intrinsic is safe to scalarize. There's also `isTriviallyScalarizable`,
but this seems more geared towards the Scalarizer pass and includes
intrinsics with multiple return values.
It also only handles intrinsics with two operands of the same type as
the return type. In the future this could be generalized to handle an
arbitrary number of operands, including unary (e.g. fneg) and ternary
(e.g. fma) intrinsics, as well as different operand types, e.g. powi or
scmp.
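For example, the new tests include cases that are deliberately left
un-scalarized for now because of those restrictions (snippets taken from
the @scalar_argument and @scmp tests below):

  ; powi takes a scalar i32 exponent operand, so it is not scalarized yet
  %x.insert = insertelement <4 x float> poison, float %x, i32 0
  %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)

  ; scmp returns <4 x i2>, a different type from its <4 x i32> operands
  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
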
Added:
llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c7d221e8d1e5c..fe1d930f295ce 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
static cl::opt<bool> DisableVectorCombine(
"disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,34 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
return true;
}
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
Value *Ins0, *Ins1;
if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+ // TODO: Allow unary and ternary intrinsics
+ // TODO: Allow intrinsics with different argument types
+ // TODO: Allow intrinsics with scalar arguments
+ if (auto *II = dyn_cast<IntrinsicInst>(&I);
+ II && II->arg_size() == 2 &&
+ isTriviallyVectorizable(II->getIntrinsicID()) &&
+ all_of(II->args(),
+ [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
+ Ins0 = II->getArgOperand(0);
+ Ins1 = II->getArgOperand(1);
+ } else {
+ return false;
+ }
+ }
// Do not convert the vector condition of a vector select into a scalar
// condition. That may cause problems for codegen because of differences in
// boolean formats and register-file transfers.
// TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
+ if (isa<CmpInst>(I))
for (User *U : I.users())
if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
return false;
@@ -1085,15 +1099,25 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
+ if (isa<CmpInst>(I)) {
CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
ScalarOpCost = TTI.getCmpSelInstrCost(
Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
VectorOpCost = TTI.getCmpSelInstrCost(
Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
- } else {
+ } else if (isa<BinaryOperator>(I)) {
ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+ } else {
+ auto *II = cast<IntrinsicInst>(&I);
+ IntrinsicCostAttributes ScalarICA(
+ II->getIntrinsicID(), ScalarTy,
+ SmallVector<Type *>(II->arg_size(), ScalarTy));
+ ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+ IntrinsicCostAttributes VectorICA(
+ II->getIntrinsicID(), VecTy,
+ SmallVector<Type *>(II->arg_size(), VecTy));
+ VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
}
// Get cost estimate for the insert element. This cost will factor into
@@ -1112,10 +1136,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
// inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
+ if (isa<CmpInst>(I))
++NumScalarCmp;
- else
+ else if (isa<BinaryOperator>(I))
++NumScalarBO;
+ else if (isa<IntrinsicInst>(I))
+ ++NumScalarIntrinsic;
// For constant cases, extract the scalar element, this should constant fold.
if (IsConst0)
@@ -1123,9 +1149,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (IsConst1)
V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ Value *Scalar;
+ if (isa<CmpInst>(I))
+ Scalar = Builder.CreateCmp(Pred, V0, V1);
+ else if (isa<BinaryOperator>(I))
+ Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+ else
+ Scalar = Builder.CreateIntrinsic(
+ ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1});
Scalar->setName(I.getName() + ".scalar");
@@ -1135,9 +1166,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
ScalarInst->copyIRFlags(&I);
// Fold the vector constants in the original vectors into a new base vector.
- Value *NewVecC =
- IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ Value *NewVecC;
+ if (isa<CmpInst>(I))
+ NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+ else if (isa<BinaryOperator>(I))
+ NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+ else
+ NewVecC = Builder.CreateIntrinsic(
+ VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1});
Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
replaceValue(I, *Insert);
return true;
diff --git a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
new file mode 100644
index 0000000000000..5f3229398792a
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -S -p vector-combine -mtriple=x86_64 -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+
+define <2 x float> @maxnum(float %x, float %y) {
+; SSE2-LABEL: define <2 x float> @maxnum(
+; SSE2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE2-NEXT: [[X_INSERT:%.*]] = insertelement <2 x float> poison, float [[X]], i32 0
+; SSE2-NEXT: [[Y_INSERT:%.*]] = insertelement <2 x float> poison, float [[Y]], i32 0
+; SSE2-NEXT: [[V:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X_INSERT]], <2 x float> [[Y_INSERT]])
+; SSE2-NEXT: ret <2 x float> [[V]]
+;
+; AVX2-LABEL: define <2 x float> @maxnum(
+; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT: [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]])
+; AVX2-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison)
+; AVX2-NEXT: [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; AVX2-NEXT: ret <2 x float> [[V]]
+;
+ %x.insert = insertelement <2 x float> poison, float %x, i32 0
+ %y.insert = insertelement <2 x float> poison, float %y, i32 0
+ %v = call <2 x float> @llvm.maxnum(<2 x float> %x.insert, <2 x float> %y.insert)
+ ret <2 x float> %v
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
new file mode 100644
index 0000000000000..e7683d72a052d
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT: [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+ ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT: [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT: ret <vscale x 4 x i32> [[V]]
+;
+ %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+ ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT: [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT: ret <4 x i32> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+ %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+ ret <4 x i32> %v
+}
+
+; TODO: We should be able to scalarize this if we preserve the scalar argument.
+define <4 x float> @scalar_argument(float %x) {
+; CHECK-LABEL: define <4 x float> @scalar_argument(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0
+; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42)
+; CHECK-NEXT: ret <4 x float> [[V]]
+;
+ %x.insert = insertelement <4 x float> poison, float %x, i32 0
+ %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
+ ret <4 x float> %v
+}
+
+define <4 x i2> @scmp(i32 %x) {
+; CHECK-LABEL: define <4 x i2> @scmp(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer)
+; CHECK-NEXT: ret <4 x i2> [[V]]
+;
+ %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
+ ret <4 x i2> %v
+}