[llvm] [WIP][VectorCombine] Support nary intrinsics in scalarizeBinOpOrCmp (PR #138406)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Sat May 3 08:08:18 PDT 2025


https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/138406

Work-in-progress support for unary and ternary intrinsics in scalarizeBinOpOrCmp.

Enabling unary operators should be a matter of relaxing the isa<> check after this.

Stacked on #138095 and https://github.com/llvm/llvm-project/pull/137823
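
As a condensed before/after sketch (taken from the umax_fixed test added below), a binop-like intrinsic whose vector operands are single-lane insertions is rewritten to call the scalar form of the intrinsic, followed by a single insertelement:

  ; Before
  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
  %v = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x.insert, <4 x i32> %y.insert)

  ; After: the leftover vector call on the (here all-poison) constant lanes
  ; is expected to constant fold away in a later clean-up.
  %v.scalar = call i32 @llvm.umax.i32(i32 %x, i32 %y)
  %v.vec = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
  %v = insertelement <4 x i32> %v.vec, i32 %v.scalar, i64 0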

From 1d031ea0234499e5cd42df4933508379a2e286fd Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 15:32:07 +0800
Subject: [PATCH 01/12] Precommit tests

---
 .../RISCV/intrinsic-scalarize.ll              | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll

diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
new file mode 100644
index 0000000000000..55b78c4716bc0
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -p vector-combine | FileCheck %s
+
+define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <4 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> %y.insert)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[X_INSERT]], <vscale x 4 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> %y.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> [[X_INSERT]])
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> %x.insert)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
+; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i32> @llvm.umax(<4 x i32> %x.insert, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret <4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> [[X_INSERT]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> %x.insert)
+  ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
+; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[X_INSERT]], <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+  %v = call <vscale x 4 x i32> @llvm.umax(<vscale x 4 x i32> %x.insert, <vscale x 4 x i32> splat (i32 42))
+  ret <vscale x 4 x i32> %v
+}
+
+; Shouldn't be scalarized, not a "trivially vectorizable" intrinsic.
+define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @non_trivially_vectorizable(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <8 x i32> poison, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> [[X_INSERT]], <8 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    ret <4 x i32> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %y.insert = insertelement <8 x i32> poison, i32 %y, i32 0
+  %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
+  ret <4 x i32> %v
+}

From ebfcbe452b7657e54c4c4797b452136afb87a9b3 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 15:32:38 +0800
Subject: [PATCH 02/12] [VectorCombine] Scalarize binop-like intrinsics

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 64 ++++++++++++++-----
 .../RISCV/intrinsic-scalarize.ll              | 32 ++++++----
 2 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 04c084ffdda97..7a7c533267f6f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
 STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
 STATISTIC(NumScalarBO, "Number of scalar binops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
 
 static cl::opt<bool> DisableVectorCombine(
     "disable-vector-combine", cl::init(false), cl::Hidden,
@@ -1016,21 +1017,29 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   return true;
 }
 
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+/// Match a vector binop, compare or binop-like intrinsic with at least one
+/// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
+/// by insertelement.
 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
   Value *Ins0, *Ins1;
   if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
-      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
-    return false;
+      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+    if (auto *II = dyn_cast<IntrinsicInst>(&I);
+        II && II->arg_size() == 2 &&
+        isTriviallyVectorizable(II->getIntrinsicID())) {
+      Ins0 = II->getArgOperand(0);
+      Ins1 = II->getArgOperand(1);
+    } else {
+      return false;
+    }
+  }
 
   // Do not convert the vector condition of a vector select into a scalar
   // condition. That may cause problems for codegen because of differences in
   // boolean formats and register-file transfers.
   // TODO: Can we account for that in the cost model?
-  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
-  if (IsCmp)
+  if (isa<CmpInst>(I))
     for (User *U : I.users())
       if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
         return false;
@@ -1085,15 +1094,24 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   unsigned Opcode = I.getOpcode();
   InstructionCost ScalarOpCost, VectorOpCost;
-  if (IsCmp) {
+  if (isa<CmpInst>(I)) {
     CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
     ScalarOpCost = TTI.getCmpSelInstrCost(
         Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
     VectorOpCost = TTI.getCmpSelInstrCost(
         Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
-  } else {
+  } else if (isa<BinaryOperator>(I)) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
+  } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+    IntrinsicCostAttributes ScalarICA(
+        II->getIntrinsicID(), ScalarTy,
+        SmallVector<Type *>(II->arg_size(), ScalarTy));
+    ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
+    IntrinsicCostAttributes VectorICA(
+        II->getIntrinsicID(), VecTy,
+        SmallVector<Type *>(II->arg_size(), VecTy));
+    VectorOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
   }
 
   // Get cost estimate for the insert element. This cost will factor into
@@ -1112,10 +1130,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
   // inselt NewVecC, (scalar_op V0, V1), Index
-  if (IsCmp)
+  if (isa<CmpInst>(I))
     ++NumScalarCmp;
-  else
+  else if (isa<BinaryOperator>(I))
     ++NumScalarBO;
+  else if (isa<IntrinsicInst>(I))
+    ++NumScalarIntrinsic;
 
   // For constant cases, extract the scalar element, this should constant fold.
   if (IsConst0)
@@ -1123,9 +1143,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   if (IsConst1)
     V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
 
-  Value *Scalar =
-      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  Value *Scalar;
+  if (isa<CmpInst>(I))
+    Scalar = Builder.CreateCmp(Pred, V0, V1);
+  else if (isa<BinaryOperator>(I))
+    Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+  else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1});
+  else
+    llvm_unreachable("Unexpected instruction type");
 
   Scalar->setName(I.getName() + ".scalar");
 
@@ -1135,9 +1161,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     ScalarInst->copyIRFlags(&I);
 
   // Fold the vector constants in the original vectors into a new base vector.
-  Value *NewVecC =
-      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
-            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  Value *NewVecC;
+  if (isa<CmpInst>(I))
+    NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+  else if (isa<BinaryOperator>(I))
+    NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+  else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
+  else
+    llvm_unreachable("Unexpected instruction type");
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
   replaceValue(I, *Insert);
   return true;
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
index 55b78c4716bc0..5a25f5faf8911 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -4,9 +4,9 @@
 define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -18,9 +18,9 @@ define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
 define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[Y_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[X_INSERT]], <vscale x 4 x i32> [[Y_INSERT]])
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -32,8 +32,9 @@ define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
 define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> [[X_INSERT]])
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -44,8 +45,9 @@ define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
 define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -56,8 +58,9 @@ define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
 define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> [[X_INSERT]])
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -68,8 +71,9 @@ define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
 define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
-; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[V:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[X_INSERT]], <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0

From 43743048ab11bfd108b377b8d6ba4f6d55472fd9 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 15:50:24 +0800
Subject: [PATCH 03/12] clang-format

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7a7c533267f6f..4f018f5af03a5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1167,7 +1167,8 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   else if (isa<BinaryOperator>(I))
     NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
   else if (auto *II = dyn_cast<IntrinsicInst>(&I))
-    NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
+    NewVecC =
+        Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
   else
     llvm_unreachable("Unexpected instruction type");
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);

From 968f0613d4960ad4bceb556a3cee90193f28d621 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 17:21:14 +0800
Subject: [PATCH 04/12] Check isVectorIntrinsicWithScalarOpAtArg

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp     |  6 +++++-
 .../VectorCombine/RISCV/intrinsic-scalarize.ll      | 13 +++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4f018f5af03a5..345283862ec60 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1027,7 +1027,11 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
       !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
     if (auto *II = dyn_cast<IntrinsicInst>(&I);
         II && II->arg_size() == 2 &&
-        isTriviallyVectorizable(II->getIntrinsicID())) {
+        isTriviallyVectorizable(II->getIntrinsicID()) &&
+        none_of(index_range(0, II->arg_size()), [this, &II](size_t OpIdx) {
+          return isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), OpIdx,
+                                                    &TTI);
+        })) {
       Ins0 = II->getArgOperand(0);
       Ins1 = II->getArgOperand(1);
     } else {
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
index 5a25f5faf8911..e12b1ca99c6d1 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -95,3 +95,16 @@ define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
   %v = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %x.insert, <8 x i32> %y.insert)
   ret <4 x i32> %v
 }
+
+; TODO: We should be able to scalarize this if we preserve the scalar argument.
+define <4 x float> @scalar_argument(float %x) {
+; CHECK-LABEL: define <4 x float> @scalar_argument(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[X_INSERT]], i32 42)
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+  %x.insert = insertelement <4 x float> poison, float %x, i32 0
+  %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
+  ret <4 x float> %v
+}

From d559e157d34c3a90921ead867f4f576e5826cb7d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 17:30:59 +0800
Subject: [PATCH 05/12] Just check all arguments have same type as return

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp      |  9 +++++----
 .../VectorCombine/RISCV/intrinsic-scalarize.ll       | 12 ++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 345283862ec60..57a0ca80361bf 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1025,13 +1025,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   Value *Ins0, *Ins1;
   if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
       !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
+    // TODO: Allow unary and ternary intrinsics
+    // TODO: Allow intrinsics with different arguments types
+    // TODO: Allow intrinsics with scalar arguments
     if (auto *II = dyn_cast<IntrinsicInst>(&I);
         II && II->arg_size() == 2 &&
         isTriviallyVectorizable(II->getIntrinsicID()) &&
-        none_of(index_range(0, II->arg_size()), [this, &II](size_t OpIdx) {
-          return isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), OpIdx,
-                                                    &TTI);
-        })) {
+        all_of(II->args(),
+               [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
       Ins0 = II->getArgOperand(0);
       Ins1 = II->getArgOperand(1);
     } else {
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
index e12b1ca99c6d1..e7683d72a052d 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
@@ -108,3 +108,15 @@ define <4 x float> @scalar_argument(float %x) {
   %v = call <4 x float> @llvm.powi(<4 x float> %x.insert, i32 42)
   ret <4 x float> %v
 }
+
+define <4 x i2> @scmp(i32 %x) {
+; CHECK-LABEL: define <4 x i2> @scmp(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_INSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[V:%.*]] = call <4 x i2> @llvm.scmp.v4i2.v4i32(<4 x i32> [[X_INSERT]], <4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <4 x i2> [[V]]
+;
+  %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
+  %v = call <4 x i2> @llvm.scmp(<4 x i32> %x.insert, <4 x i32> splat (i32 0))
+  ret <4 x i2> %v
+}

From 3c3f7e32184e6c89919a6a59ca9c8ded35bd6b10 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 17:47:43 +0800
Subject: [PATCH 06/12] Fix vector ICA type, add llvm_unreachable

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 57a0ca80361bf..3d9aac56d959b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1116,8 +1116,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     IntrinsicCostAttributes VectorICA(
         II->getIntrinsicID(), VecTy,
         SmallVector<Type *>(II->arg_size(), VecTy));
-    VectorOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
-  }
+    VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
+  } else
+    llvm_unreachable("Unexpected instrucion type");
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.

From fea2417523b0bc6bf7ddecde509f8258b23e1d72 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 1 May 2025 17:48:57 +0800
Subject: [PATCH 07/12] Fix comment typo

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 3d9aac56d959b..39dd5141b245d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1026,7 +1026,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
       !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
     // TODO: Allow unary and ternary intrinsics
-    // TODO: Allow intrinsics with different arguments types
+    // TODO: Allow intrinsics with different argument types
     // TODO: Allow intrinsics with scalar arguments
     if (auto *II = dyn_cast<IntrinsicInst>(&I);
         II && II->arg_size() == 2 &&

From fbce2ad422a90c5d5fb307e43168788ab7ac7120 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 3 May 2025 20:50:26 +0800
Subject: [PATCH 08/12] Replace llvm_unreachable with cast

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 39dd5141b245d..a2ffbdacb26d1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1108,7 +1108,8 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   } else if (isa<BinaryOperator>(I)) {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
-  } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+  } else {
+    auto *II = cast<IntrinsicInst>(&I);
     IntrinsicCostAttributes ScalarICA(
         II->getIntrinsicID(), ScalarTy,
         SmallVector<Type *>(II->arg_size(), ScalarTy));
@@ -1117,8 +1118,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
         II->getIntrinsicID(), VecTy,
         SmallVector<Type *>(II->arg_size(), VecTy));
     VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
-  } else
-    llvm_unreachable("Unexpected instrucion type");
+  }
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
@@ -1154,10 +1154,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     Scalar = Builder.CreateCmp(Pred, V0, V1);
   else if (isa<BinaryOperator>(I))
     Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
-  else if (auto *II = dyn_cast<IntrinsicInst>(&I))
-    Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), {V0, V1});
   else
-    llvm_unreachable("Unexpected instruction type");
+    Scalar = Builder.CreateIntrinsic(
+        ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1});
 
   Scalar->setName(I.getName() + ".scalar");
 
@@ -1172,11 +1171,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
   else if (isa<BinaryOperator>(I))
     NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
-  else if (auto *II = dyn_cast<IntrinsicInst>(&I))
-    NewVecC =
-        Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), {VecC0, VecC1});
   else
-    llvm_unreachable("Unexpected instruction type");
+    NewVecC = Builder.CreateIntrinsic(
+        VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1});
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
   replaceValue(I, *Insert);
   return true;

From c2f403d0c11065d748e85a5eecf4aa0c8236702b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 3 May 2025 23:00:52 +0800
Subject: [PATCH 09/12] Move tests out of RISC-V folder

---
 .../Transforms/VectorCombine/{RISCV => }/intrinsic-scalarize.ll   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/Transforms/VectorCombine/{RISCV => }/intrinsic-scalarize.ll (100%)

diff --git a/llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
similarity index 100%
rename from llvm/test/Transforms/VectorCombine/RISCV/intrinsic-scalarize.ll
rename to llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll

From 436152ec3b668a7df45e64ca018367f74297576b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 29 Apr 2025 22:27:58 +0800
Subject: [PATCH 10/12] [VectorCombine][X86] Use updated getVectorInstrCost
 hook

This addresses a TODO where previously scalarizeBinopOrCmp conservatively bailed if one of the operands was a load.

getVectorInstrCost was updated in https://reviews.llvm.org/D140498 to take the operand values, so we can now pass in the scalar value being inserted, which should give an accurate cost for a gather.

We want to remove this restriction on RISC-V, since there scalarization is always profitable whether or not the scalar is a load.

On X86 this seems to prevent scalarization on SSE when the index is 0, because the cost of an insertion into undef drops from 12 to 1 once the value is passed in. Is this correct? Or is there a way to fix this in X86TTIImpl::getVectorInstrCost? cc @alexey-bataev
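
For reference, this is the kind of case that now scalarizes (adapted from shl_constant_op1_load in the updated tests), where the load operand previously forced a bail-out:

  %ld = load i64, ptr %p
  %ins = insertelement <2 x i64> undef, i64 %ld, i32 0
  %bo = shl nuw <2 x i64> %ins, <i64 5, i64 2>

  ; ...becomes:
  %bo.scalar = shl nuw i64 %ld, 5
  %bo = insertelement <2 x i64> zeroinitializer, i64 %bo.scalar, i64 0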
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 20 ++---
 .../X86/insert-binop-inseltpoison.ll          | 33 ++++---
 ...insert-binop-with-constant-inseltpoison.ll | 86 +++++++++++++------
 .../X86/insert-binop-with-constant.ll         | 86 +++++++++++++------
 .../VectorCombine/X86/insert-binop.ll         | 33 ++++---
 .../X86/scalarize-cmp-inseltpoison.ll         | 60 +++++++++----
 .../VectorCombine/X86/scalarize-cmp.ll        | 60 +++++++++----
 7 files changed, 252 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a2ffbdacb26d1..455d6f4e4d2f5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1080,14 +1080,6 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
       VecTy1->getElementCount().getKnownMinValue() <= Index1)
     return false;
 
-  // Bail for single insertion if it is a load.
-  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
-  auto *I0 = dyn_cast_or_null<Instruction>(V0);
-  auto *I1 = dyn_cast_or_null<Instruction>(V1);
-  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
-      (IsConst1 && I0 && I0->mayReadFromMemory()))
-    return false;
-
   uint64_t Index = IsConst0 ? Index1 : Index0;
   Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
   Type *VecTy = I.getType();
@@ -1124,11 +1116,15 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   // both sequences.
   InstructionCost InsertCost = TTI.getVectorInstrCost(
       Instruction::InsertElement, VecTy, CostKind, Index);
-  InstructionCost OldCost =
-      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
+  InstructionCost InsertCostV0 = TTI.getVectorInstrCost(
+      Instruction::InsertElement, VecTy, CostKind, Index, VecC0, V0);
+  InstructionCost InsertCostV1 = TTI.getVectorInstrCost(
+      Instruction::InsertElement, VecTy, CostKind, Index, VecC1, V1);
+  InstructionCost OldCost = (IsConst0 ? 0 : InsertCostV0) +
+                            (IsConst1 ? 0 : InsertCostV1) + VectorOpCost;
   InstructionCost NewCost = ScalarOpCost + InsertCost +
-                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
-                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCostV0) +
+                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCostV1);
 
   // We want to scalarize unless the vector variant actually has lower cost.
   if (OldCost < NewCost || !NewCost.isValid())
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
index c1100780254c1..76440c7047059 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
@@ -8,10 +8,16 @@ declare void @usef(<4 x float>)
 ; Eliminating an insert is profitable.
 
 define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
-; CHECK-LABEL: @ins0_ins0_add(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i8> poison, i8 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @ins0_ins0_add(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <16 x i8> poison, i8 [[X:%.*]], i32 0
+; SSE-NEXT:    [[I1:%.*]] = insertelement <16 x i8> poison, i8 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_add(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <16 x i8> poison, i8 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %i0 = insertelement <16 x i8> poison, i8 %x, i32 0
   %i1 = insertelement <16 x i8> poison, i8 %y, i32 0
@@ -155,12 +161,19 @@ define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
 ; Extra use is accounted for in cost calculation.
 
 define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) {
-; CHECK-LABEL: @ins0_ins0_xor(
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
-; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: @ins0_ins0_xor(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; SSE-NEXT:    call void @use(<4 x i32> [[I0]])
+; SSE-NEXT:    [[I1:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = xor <4 x i32> [[I0]], [[I1]]
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_xor(
+; AVX-NEXT:    [[I0:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; AVX-NEXT:    call void @use(<4 x i32> [[I0]])
+; AVX-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i32> [[R]]
 ;
   %i0 = insertelement <4 x i32> poison, i32 %x, i32 0
   call void @use(<4 x i32> %i0)
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
index 05251cb829b2b..751539aa0f431 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
@@ -3,10 +3,15 @@
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define <2 x i64> @add_constant(i64 %x) {
-; CHECK-LABEL: @add_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @add_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @add_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = add <2 x i64> %ins, <i64 42, i64 undef>
@@ -14,10 +19,15 @@ define <2 x i64> @add_constant(i64 %x) {
 }
 
 define <2 x i64> @add_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @add_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @add_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @add_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = add <2 x i64> %ins, <i64 42, i64 -42>
@@ -153,8 +163,8 @@ define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @shl_constant_op0_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op0_load(
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 1
-; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 undef, i64 2>, [[INS]]
+; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl i64 2, [[LD]]
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ld = load i64, ptr %p
@@ -204,8 +214,8 @@ define <2 x i64> @shl_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @shl_constant_op1_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op1_load(
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
-; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl nuw i64 [[LD]], 5
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ld = load i64, ptr %p
@@ -479,10 +489,15 @@ define <2 x i64> @sdiv_constant_op1_not_undef_lane(i64 %x) {
 }
 
 define <2 x i64> @and_constant(i64 %x) {
-; CHECK-LABEL: @and_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @and_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @and_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = and <2 x i64> %ins, <i64 42, i64 undef>
@@ -490,10 +505,15 @@ define <2 x i64> @and_constant(i64 %x) {
 }
 
 define <2 x i64> @and_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @and_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @and_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @and_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = and <2 x i64> %ins, <i64 42, i64 -42>
@@ -523,10 +543,15 @@ define <2 x i64> @or_constant_not_undef_lane(i64 %x) {
 }
 
 define <2 x i64> @xor_constant(i64 %x) {
-; CHECK-LABEL: @xor_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @xor_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @xor_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = xor <2 x i64> %ins, <i64 42, i64 undef>
@@ -534,10 +559,15 @@ define <2 x i64> @xor_constant(i64 %x) {
 }
 
 define <2 x i64> @xor_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @xor_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @xor_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @xor_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> poison, i64 %x, i32 0
   %bo = xor <2 x i64> %ins, <i64 42, i64 -42>
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
index bbdd76c58b58e..2b4db0583e69c 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
@@ -3,10 +3,15 @@
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define <2 x i64> @add_constant(i64 %x) {
-; CHECK-LABEL: @add_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @add_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @add_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = add <2 x i64> %ins, <i64 42, i64 undef>
@@ -14,10 +19,15 @@ define <2 x i64> @add_constant(i64 %x) {
 }
 
 define <2 x i64> @add_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @add_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @add_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @add_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = add i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = add <2 x i64> %ins, <i64 42, i64 -42>
@@ -153,8 +163,8 @@ define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @shl_constant_op0_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op0_load(
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 1
-; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 undef, i64 2>, [[INS]]
+; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl i64 2, [[LD]]
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ld = load i64, ptr %p
@@ -204,8 +214,8 @@ define <2 x i64> @shl_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @shl_constant_op1_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op1_load(
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
-; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 2>
+; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl nuw i64 [[LD]], 5
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ld = load i64, ptr %p
@@ -479,10 +489,15 @@ define <2 x i64> @sdiv_constant_op1_not_undef_lane(i64 %x) {
 }
 
 define <2 x i64> @and_constant(i64 %x) {
-; CHECK-LABEL: @and_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @and_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @and_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = and <2 x i64> %ins, <i64 42, i64 undef>
@@ -490,10 +505,15 @@ define <2 x i64> @and_constant(i64 %x) {
 }
 
 define <2 x i64> @and_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @and_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @and_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = and <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @and_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = and i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = and <2 x i64> %ins, <i64 42, i64 -42>
@@ -523,10 +543,15 @@ define <2 x i64> @or_constant_not_undef_lane(i64 %x) {
 }
 
 define <2 x i64> @xor_constant(i64 %x) {
-; CHECK-LABEL: @xor_constant(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @xor_constant(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 undef>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @xor_constant(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = xor <2 x i64> %ins, <i64 42, i64 undef>
@@ -534,10 +559,15 @@ define <2 x i64> @xor_constant(i64 %x) {
 }
 
 define <2 x i64> @xor_constant_not_undef_lane(i64 %x) {
-; CHECK-LABEL: @xor_constant_not_undef_lane(
-; CHECK-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[BO]]
+; SSE-LABEL: @xor_constant_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
+; SSE-NEXT:    [[BO:%.*]] = xor <2 x i64> [[INS]], <i64 42, i64 -42>
+; SSE-NEXT:    ret <2 x i64> [[BO]]
+;
+; AVX-LABEL: @xor_constant_not_undef_lane(
+; AVX-NEXT:    [[BO_SCALAR:%.*]] = xor i64 [[X:%.*]], 42
+; AVX-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
+; AVX-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bo = xor <2 x i64> %ins, <i64 42, i64 -42>
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
index cd7e2ad2ca2c6..789ee7b3cdf0d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -8,10 +8,16 @@ declare void @usef(<4 x float>)
 ; Eliminating an insert is profitable.
 
 define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
-; CHECK-LABEL: @ins0_ins0_add(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @ins0_ins0_add(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 0
+; SSE-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_add(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
   %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
@@ -155,12 +161,19 @@ define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
 ; Extra use is accounted for in cost calculation.
 
 define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) {
-; CHECK-LABEL: @ins0_ins0_xor(
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
-; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: @ins0_ins0_xor(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; SSE-NEXT:    call void @use(<4 x i32> [[I0]])
+; SSE-NEXT:    [[I1:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = xor <4 x i32> [[I0]], [[I1]]
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_xor(
+; AVX-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; AVX-NEXT:    call void @use(<4 x i32> [[I0]])
+; AVX-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i32> [[R]]
 ;
   %i0 = insertelement <4 x i32> undef, i32 %x, i32 0
   call void @use(<4 x i32> %i0)
diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
index 14b517d613de4..adef56256bc7d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 declare void @use(<4 x i32>)
 declare void @usef(<4 x float>)
@@ -8,10 +8,16 @@ declare void @usef(<4 x float>)
 ; Eliminating an insert is profitable.
 
 define <16 x i1> @ins0_ins0_i8(i8 %x, i8 %y) {
-; CHECK-LABEL: @ins0_ins0_i8(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i1> poison, i1 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <16 x i1> [[R]]
+; SSE-LABEL: @ins0_ins0_i8(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <16 x i8> poison, i8 [[X:%.*]], i32 0
+; SSE-NEXT:    [[I1:%.*]] = insertelement <16 x i8> poison, i8 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = icmp eq <16 x i8> [[I0]], [[I1]]
+; SSE-NEXT:    ret <16 x i1> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_i8(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <16 x i1> poison, i1 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <16 x i1> [[R]]
 ;
   %i0 = insertelement <16 x i8> poison, i8 %x, i32 0
   %i1 = insertelement <16 x i8> poison, i8 %y, i32 0
@@ -168,11 +174,17 @@ define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
 ; negative test - load prevents the transform
 
 define <2 x i1> @constant_op1_i64_load(ptr %p) {
-; CHECK-LABEL: @constant_op1_i64_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; SSE-LABEL: @constant_op1_i64_load(
+; SSE-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
+; SSE-NEXT:    [[R_SCALAR:%.*]] = icmp eq i64 [[LD]], 42
+; SSE-NEXT:    [[R:%.*]] = insertelement <2 x i1> poison, i1 [[R_SCALAR]], i64 0
+; SSE-NEXT:    ret <2 x i1> [[R]]
+;
+; AVX-LABEL: @constant_op1_i64_load(
+; AVX-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
+; AVX-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
+; AVX-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
+; AVX-NEXT:    ret <2 x i1> [[R]]
 ;
   %ld = load i64, ptr %p
   %ins = insertelement <2 x i64> poison, i64 %ld, i32 0
@@ -236,10 +248,15 @@ define <2 x i1> @constant_op1_f64(double %x) {
 }
 
 define <4 x i1> @constant_op1_f32_not_undef_lane(float %x) {
-; CHECK-LABEL: @constant_op1_f32_not_undef_lane(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> poison, i1 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i1> [[R]]
+; SSE-LABEL: @constant_op1_f32_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = fcmp uge <4 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01, float 0.000000e+00, float 1.000000e+00>
+; SSE-NEXT:    ret <4 x i1> [[R]]
+;
+; AVX-LABEL: @constant_op1_f32_not_undef_lane(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i1> poison, i1 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i1> [[R]]
 ;
   %ins = insertelement <4 x float> poison, float %x, i32 0
   %r = fcmp uge <4 x float> %ins, <float 42.0, float -42.0, float 0.0, float 1.0>
@@ -279,10 +296,15 @@ define <4 x float> @vec_select_use2(<4 x float> %x, <4 x float> %y, float %a) {
 }
 
 define <4 x i1> @vector_of_pointers(ptr %t1) {
-; CHECK-LABEL: @vector_of_pointers(
-; CHECK-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
-; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x i1> poison, i1 [[T6_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i1> [[T6]]
+; SSE-LABEL: @vector_of_pointers(
+; SSE-NEXT:    [[T5:%.*]] = insertelement <4 x ptr> poison, ptr [[T1:%.*]], i32 0
+; SSE-NEXT:    [[T6:%.*]] = icmp ne <4 x ptr> [[T5]], zeroinitializer
+; SSE-NEXT:    ret <4 x i1> [[T6]]
+;
+; AVX-LABEL: @vector_of_pointers(
+; AVX-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
+; AVX-NEXT:    [[T6:%.*]] = insertelement <4 x i1> poison, i1 [[T6_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i1> [[T6]]
 ;
   %t5 = insertelement <4 x ptr> poison, ptr %t1, i32 0
   %t6 = icmp ne <4 x ptr> %t5, zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
index edd92c3f1c14c..0c585f20470c7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 declare void @use(<4 x i32>)
 declare void @usef(<4 x float>)
@@ -8,10 +8,16 @@ declare void @usef(<4 x float>)
 ; Eliminating an insert is profitable.
 
 define <16 x i1> @ins0_ins0_i8(i8 %x, i8 %y) {
-; CHECK-LABEL: @ins0_ins0_i8(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i1> undef, i1 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <16 x i1> [[R]]
+; SSE-LABEL: @ins0_ins0_i8(
+; SSE-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 0
+; SSE-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = icmp eq <16 x i8> [[I0]], [[I1]]
+; SSE-NEXT:    ret <16 x i1> [[R]]
+;
+; AVX-LABEL: @ins0_ins0_i8(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[R:%.*]] = insertelement <16 x i1> undef, i1 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <16 x i1> [[R]]
 ;
   %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
   %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
@@ -168,11 +174,17 @@ define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
 ; negative test - load prevents the transform
 
 define <2 x i1> @constant_op1_i64_load(ptr %p) {
-; CHECK-LABEL: @constant_op1_i64_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
-; CHECK-NEXT:    ret <2 x i1> [[R]]
+; SSE-LABEL: @constant_op1_i64_load(
+; SSE-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
+; SSE-NEXT:    [[R_SCALAR:%.*]] = icmp eq i64 [[LD]], 42
+; SSE-NEXT:    [[R:%.*]] = insertelement <2 x i1> undef, i1 [[R_SCALAR]], i64 0
+; SSE-NEXT:    ret <2 x i1> [[R]]
+;
+; AVX-LABEL: @constant_op1_i64_load(
+; AVX-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
+; AVX-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
+; AVX-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
+; AVX-NEXT:    ret <2 x i1> [[R]]
 ;
   %ld = load i64, ptr %p
   %ins = insertelement <2 x i64> undef, i64 %ld, i32 0
@@ -236,10 +248,15 @@ define <2 x i1> @constant_op1_f64(double %x) {
 }
 
 define <4 x i1> @constant_op1_f32_not_undef_lane(float %x) {
-; CHECK-LABEL: @constant_op1_f32_not_undef_lane(
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i1> [[R]]
+; SSE-LABEL: @constant_op1_f32_not_undef_lane(
+; SSE-NEXT:    [[INS:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = fcmp uge <4 x float> [[INS]], <float 4.200000e+01, float -4.200000e+01, float 0.000000e+00, float 1.000000e+00>
+; SSE-NEXT:    ret <4 x i1> [[R]]
+;
+; AVX-LABEL: @constant_op1_f32_not_undef_lane(
+; AVX-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i1> [[R]]
 ;
   %ins = insertelement <4 x float> undef, float %x, i32 0
   %r = fcmp uge <4 x float> %ins, <float 42.0, float -42.0, float 0.0, float 1.0>
@@ -279,10 +296,15 @@ define <4 x float> @vec_select_use2(<4 x float> %x, <4 x float> %y, float %a) {
 }
 
 define <4 x i1> @vector_of_pointers(ptr %t1) {
-; CHECK-LABEL: @vector_of_pointers(
-; CHECK-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
-; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x i1> undef, i1 [[T6_SCALAR]], i64 0
-; CHECK-NEXT:    ret <4 x i1> [[T6]]
+; SSE-LABEL: @vector_of_pointers(
+; SSE-NEXT:    [[T5:%.*]] = insertelement <4 x ptr> undef, ptr [[T1:%.*]], i32 0
+; SSE-NEXT:    [[T6:%.*]] = icmp ne <4 x ptr> [[T5]], zeroinitializer
+; SSE-NEXT:    ret <4 x i1> [[T6]]
+;
+; AVX-LABEL: @vector_of_pointers(
+; AVX-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
+; AVX-NEXT:    [[T6:%.*]] = insertelement <4 x i1> undef, i1 [[T6_SCALAR]], i64 0
+; AVX-NEXT:    ret <4 x i1> [[T6]]
 ;
   %t5 = insertelement <4 x ptr> undef, ptr %t1, i32 0
   %t6 = icmp ne <4 x ptr> %t5, zeroinitializer
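
The new SSE/AVX prefixes capture a subtarget-dependent outcome: per the updated checks, SSE2 now keeps the wide <16 x i8> compare while AVX2 scalarizes it to a single i8 icmp plus insert. A minimal standalone sketch to reproduce the divergence (the @repro name is illustrative; the RUN lines mirror this file's, minus the FileCheck plumbing):

; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2

; Diffing the two outputs: SSE is expected to keep the vector icmp,
; AVX to replace it with a scalar icmp and an insertelement.
define <16 x i1> @repro(i8 %x, i8 %y) {
  %i0 = insertelement <16 x i8> poison, i8 %x, i32 0
  %i1 = insertelement <16 x i8> poison, i8 %y, i32 0
  %r = icmp eq <16 x i8> %i0, %i1
  ret <16 x i1> %r
}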

>From e6c15490791d9829b1fe84e25a694a5166689bb4 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 30 Apr 2025 18:47:22 +0800
Subject: [PATCH 11/12] Update PhaseOrdering tests

---
 .../X86/scalarization-inseltpoison.ll         | 26 ++++++++++++-------
 .../PhaseOrdering/X86/scalarization.ll        | 26 ++++++++++++-------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
index d36da8d028c60..6319e977bf35a 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -12,21 +12,29 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, i32 %p, i32 %j, i32 %u) {
 ; CHECK-LABEL: @square(
 ; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[K:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i64 0
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[P:%.*]], 6234
+; CHECK-NEXT:    [[SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
 ; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[H:%.*]], 75
+; CHECK-NEXT:    [[SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[MUL5]], i64 0
 ; CHECK-NEXT:    [[DIV9:%.*]] = sdiv i32 [[J:%.*]], 3452
+; CHECK-NEXT:    [[SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[DIV9]], i64 0
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
+; CHECK-NEXT:    [[SPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[MUL13]], i64 0
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
+; CHECK-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[DIV17]], i64 0
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
-; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
-; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
+; CHECK-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[MUL21]], i64 0
+; CHECK-NEXT:    [[SPLATINSERT25:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[SPLATINSERT25]], <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[SPLATINSERT18]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP10]], [[SPLATINSERT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[SPLATINSERT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[SPLATINSERT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[SPLATINSERT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[SPLATINSERT10]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SPLATINSERT22]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP8]], <i32 317425, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[ADD29]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
index c3131a41c2b2e..5922b34985815 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -12,21 +12,29 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, i32 %p, i32 %j, i32 %u) {
 ; CHECK-LABEL: @square(
 ; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[K:%.*]], 2
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[DIV]], i64 0
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[P:%.*]], 6234
+; CHECK-NEXT:    [[SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
 ; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[H:%.*]], 75
+; CHECK-NEXT:    [[SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[MUL5]], i64 0
 ; CHECK-NEXT:    [[DIV9:%.*]] = sdiv i32 [[J:%.*]], 3452
+; CHECK-NEXT:    [[SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[DIV9]], i64 0
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
+; CHECK-NEXT:    [[SPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[MUL13]], i64 0
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
+; CHECK-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> poison, i32 [[DIV17]], i64 0
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
-; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
-; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
+; CHECK-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> poison, i32 [[MUL21]], i64 0
+; CHECK-NEXT:    [[SPLATINSERT25:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[SPLATINSERT25]], <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[SPLATINSERT18]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP10]], [[SPLATINSERT6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[SPLATINSERT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[SPLATINSERT14]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[SPLATINSERT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[SPLATINSERT10]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[SPLATINSERT22]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP8]], <i32 317425, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
 ; CHECK-NEXT:    ret <4 x i32> [[ADD29]]

>From 2698f8ae3037b77c8625914b44ce4817001f7f99 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 3 May 2025 22:24:03 +0800
Subject: [PATCH 12/12] WIP support n-ary intrinsics

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 137 +++++++++---------
 .../VectorCombine/intrinsic-scalarize.ll      |  56 +++++++
 2 files changed, 126 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 455d6f4e4d2f5..a00f81613c59d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -1021,24 +1022,17 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
 /// inserted scalar operand and convert to scalar binop/cmp/intrinsic followed
 /// by insertelement.
 bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
-  CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
-  Value *Ins0, *Ins1;
-  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
-      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) {
-    // TODO: Allow unary and ternary intrinsics
-    // TODO: Allow intrinsics with different argument types
-    // TODO: Allow intrinsics with scalar arguments
-    if (auto *II = dyn_cast<IntrinsicInst>(&I);
-        II && II->arg_size() == 2 &&
-        isTriviallyVectorizable(II->getIntrinsicID()) &&
-        all_of(II->args(),
-               [&II](Value *Arg) { return Arg->getType() == II->getType(); })) {
-      Ins0 = II->getArgOperand(0);
-      Ins1 = II->getArgOperand(1);
-    } else {
+  // TODO: Allow unary operators
+  if (!isa<BinaryOperator, CmpInst, IntrinsicInst>(I))
+    return false;
+
+  // TODO: Allow intrinsics with different argument types
+  // TODO: Allow intrinsics with scalar arguments
+  if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    if (!isTriviallyVectorizable(II->getIntrinsicID()) ||
+        !all_of(II->args(),
+                [&II](Value *Arg) { return Arg->getType() == II->getType(); }))
       return false;
-    }
-  }
 
   // Do not convert the vector condition of a vector select into a scalar
   // condition. That may cause problems for codegen because of differences in
@@ -1055,36 +1049,43 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   // vec_op (inselt VecC0, V0, Index), VecC1
   // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
   // TODO: Deal with mismatched index constants and variable indexes?
-  Constant *VecC0 = nullptr, *VecC1 = nullptr;
-  Value *V0 = nullptr, *V1 = nullptr;
-  uint64_t Index0 = 0, Index1 = 0;
-  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
-                               m_ConstantInt(Index0))) &&
-      !match(Ins0, m_Constant(VecC0)))
-    return false;
-  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
-                               m_ConstantInt(Index1))) &&
-      !match(Ins1, m_Constant(VecC1)))
-    return false;
-
-  bool IsConst0 = !V0;
-  bool IsConst1 = !V1;
-  if (IsConst0 && IsConst1)
-    return false;
-  if (!IsConst0 && !IsConst1 && Index0 != Index1)
-    return false;
+  SmallVector<Value *> VecCs, ScalarOps;
+  std::optional<uint64_t> Index;
+
+  auto Ops = isa<IntrinsicInst>(I) ? cast<IntrinsicInst>(I).args()
+                                   : I.operand_values();
+  for (Value *Op : Ops) {
+    Constant *VecC;
+    Value *V;
+    uint64_t InsIdx = 0;
+    VectorType *OpTy = cast<VectorType>(Op->getType());
+    if (match(Op, m_InsertElt(m_Constant(VecC), m_Value(V),
+                              m_ConstantInt(InsIdx)))) {
+      // Bail if any inserts are out of bounds.
+      if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
+        return false;
+      // All inserts must have the same index.
+      if (!Index)
+        Index = InsIdx;
+      else if (InsIdx != *Index)
+        return false;
+      VecCs.push_back(VecC);
+      ScalarOps.push_back(V);
+    } else if (match(Op, m_Constant(VecC))) {
+      VecCs.push_back(VecC);
+      ScalarOps.push_back(nullptr);
+    } else {
+      return false;
+    }
+  }
 
-  auto *VecTy0 = cast<VectorType>(Ins0->getType());
-  auto *VecTy1 = cast<VectorType>(Ins1->getType());
-  if (VecTy0->getElementCount().getKnownMinValue() <= Index0 ||
-      VecTy1->getElementCount().getKnownMinValue() <= Index1)
+  // Bail if all operands are constant.
+  if (!Index.has_value())
     return false;
 
-  uint64_t Index = IsConst0 ? Index1 : Index0;
-  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
-  Type *VecTy = I.getType();
+  VectorType *VecTy = cast<VectorType>(I.getType());
+  Type *ScalarTy = VecTy->getScalarType();
   assert(VecTy->isVectorTy() &&
-         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
          (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
           ScalarTy->isPointerTy()) &&
          "Unexpected types for insert element into binop or cmp");
@@ -1114,17 +1115,18 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
-  InstructionCost InsertCost = TTI.getVectorInstrCost(
-      Instruction::InsertElement, VecTy, CostKind, Index);
-  InstructionCost InsertCostV0 = TTI.getVectorInstrCost(
-      Instruction::InsertElement, VecTy, CostKind, Index, VecC0, V0);
-  InstructionCost InsertCostV1 = TTI.getVectorInstrCost(
-      Instruction::InsertElement, VecTy, CostKind, Index, VecC1, V1);
-  InstructionCost OldCost = (IsConst0 ? 0 : InsertCostV0) +
-                            (IsConst1 ? 0 : InsertCostV1) + VectorOpCost;
-  InstructionCost NewCost = ScalarOpCost + InsertCost +
-                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCostV0) +
-                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCostV1);
+  InstructionCost OldCost = VectorOpCost;
+  InstructionCost NewCost =
+      ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
+                                            CostKind, *Index);
+  for (auto [Op, VecC, Scalar] : zip(Ops, VecCs, ScalarOps)) {
+    if (!Scalar)
+      continue;
+    InstructionCost InsertCost = TTI.getVectorInstrCost(
+        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
+    OldCost += InsertCost;
+    NewCost += !Op->hasOneUse() * InsertCost;
+  }
 
   // We want to scalarize unless the vector variant actually has lower cost.
   if (OldCost < NewCost || !NewCost.isValid())
@@ -1140,19 +1142,20 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
     ++NumScalarIntrinsic;
 
   // For constant cases, extract the scalar element, this should constant fold.
-  if (IsConst0)
-    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
-  if (IsConst1)
-    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+  for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
+    if (!Scalar)
+      ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
+          cast<Constant>(VecC), Builder.getInt64(*Index));
 
   Value *Scalar;
-  if (isa<CmpInst>(I))
-    Scalar = Builder.CreateCmp(Pred, V0, V1);
+  if (auto *CI = dyn_cast<CmpInst>(&I))
+    Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
   else if (isa<BinaryOperator>(I))
-    Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+    Scalar = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, ScalarOps[0],
+                                 ScalarOps[1]);
   else
     Scalar = Builder.CreateIntrinsic(
-        ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), {V0, V1});
+        ScalarTy, cast<IntrinsicInst>(I).getIntrinsicID(), ScalarOps);
 
   Scalar->setName(I.getName() + ".scalar");
 
@@ -1163,14 +1166,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // Fold the vector constants in the original vectors into a new base vector.
   Value *NewVecC;
-  if (isa<CmpInst>(I))
-    NewVecC = Builder.CreateCmp(Pred, VecC0, VecC1);
+  if (auto *CI = dyn_cast<CmpInst>(&I))
+    NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
   else if (isa<BinaryOperator>(I))
-    NewVecC = Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
+    NewVecC = Builder.CreateNAryOp(Opcode, VecCs);
   else
     NewVecC = Builder.CreateIntrinsic(
-        VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), {VecC0, VecC1});
-  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+        VecTy, cast<IntrinsicInst>(I).getIntrinsicID(), VecCs);
+  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
   replaceValue(I, *Insert);
   return true;
 }
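
To make the generalized fold concrete, here is a hedged before/after sketch of the ternary case; it matches the shape checked by the new fma tests below, and the function names are illustrative:

; Before: three single-lane inserts feed a vector fma.
define <4 x float> @fma_before(float %x, float %y, float %z) {
  %xi = insertelement <4 x float> poison, float %x, i32 0
  %yi = insertelement <4 x float> poison, float %y, i32 0
  %zi = insertelement <4 x float> poison, float %z, i32 0
  %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %xi, <4 x float> %yi, <4 x float> %zi)
  ret <4 x float> %v
}

; After: one scalar fma, inserted into the folded base vector.
define <4 x float> @fma_after(float %x, float %y, float %z) {
  %v.scalar = call float @llvm.fma.f32(float %x, float %y, float %z)
  %base = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
  %v = insertelement <4 x float> %base, float %v.scalar, i64 0
  ret <4 x float> %v
}

The all-poison NewVecC call in @fma_after is what the rewritten constant-folding path produces here, since each operand's constant base vector is poison; with real constant lanes it would fold to the vector intrinsic applied to those constants.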
diff --git a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
index e7683d72a052d..58b7f8de004d0 100644
--- a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
@@ -96,6 +96,62 @@ define <4 x i32> @non_trivially_vectorizable(i32 %x, i32 %y) {
   ret <4 x i32> %v
 }
 
+define <4 x float> @fabs_fixed(float %x) {
+; CHECK-LABEL: define <4 x float> @fabs_fixed(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+  %x.insert = insertelement <4 x float> poison, float %x, i32 0
+  %v = call <4 x float> @llvm.fabs(<4 x float> %x.insert)
+  ret <4 x float> %v
+}
+
+define <vscale x 4 x float> @fabs_scalable(float %x) {
+; CHECK-LABEL: define <vscale x 4 x float> @fabs_scalable(
+; CHECK-SAME: float [[X:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
+  %v = call <vscale x 4 x float> @llvm.fabs(<vscale x 4 x float> %x.insert)
+  ret <vscale x 4 x float> %v
+}
+
+define <4 x float> @fma_fixed(float %x, float %y, float %z) {
+; CHECK-LABEL: define <4 x float> @fma_fixed(
+; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+  %x.insert = insertelement <4 x float> poison, float %x, i32 0
+  %y.insert = insertelement <4 x float> poison, float %y, i32 0
+  %z.insert = insertelement <4 x float> poison, float %z, i32 0
+  %v = call <4 x float> @llvm.fma(<4 x float> %x.insert, <4 x float> %y.insert, <4 x float> %z.insert)
+  ret <4 x float> %v
+}
+
+define <vscale x 4 x float> @fma_scalable(float %x, float %y, float %z) {
+; CHECK-LABEL: define <vscale x 4 x float> @fma_scalable(
+; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
+; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
+;
+  %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
+  %y.insert = insertelement <vscale x 4 x float> poison, float %y, i32 0
+  %z.insert = insertelement <vscale x 4 x float> poison, float %z, i32 0
+  %v = call <vscale x 4 x float> @llvm.fma(<vscale x 4 x float> %x.insert, <vscale x 4 x float> %y.insert, <vscale x 4 x float> %z.insert)
+  ret <vscale x 4 x float> %v
+}
+
 ; TODO: We should be able to scalarize this if we preserve the scalar argument.
 define <4 x float> @scalar_argument(float %x) {
 ; CHECK-LABEL: define <4 x float> @scalar_argument(
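
The truncated @scalar_argument test above ties to the remaining TODOs in the pass: operands whose type differs from the intrinsic's return type, scalar operands in particular, still fail the all_of type check and bail out. A hypothetical example of such a case, assuming llvm.powi's inherently scalar i32 exponent is what trips the check:

define <4 x float> @powi_scalar_exponent(float %x, i32 %p) {
  ; The i32 exponent does not match the <4 x float> return type,
  ; so the current type check rejects this candidate.
  %x.insert = insertelement <4 x float> poison, float %x, i32 0
  %v = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %x.insert, i32 %p)
  ret <4 x float> %v
}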


