[llvm] [VectorCombine] Scalarize bin ops and cmps with two splatted operands (PR #137786)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 29 03:49:45 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Luke Lau (lukel97)
<details>
<summary>Changes</summary>
Consider this C loop, simplified from a case in SPEC CPU 2017:
```c
void f(int *x, int *y) {
  for (int i = 0; i < 1024; i++)
    x[i] += *y * 2;
}
```
On RISC-V we will currently generate this loop:
```asm
        lw      a5, 0(a1)
        slli    a5, a5, 1
        vsetvli a2, zero, e32, m2, ta, ma
        vmv.v.x v8, a5
        ...
.LBB0_8:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        vl2re32.v       v10, (a5)
        vadd.vv v10, v10, v8
        vs2r.v  v10, (a5)
        add     a5, a5, a4
        bne     a5, a3, .LBB0_8
```
We could avoid the splat `vmv.v.x v8, a5` by using `vadd.vx v10, v10, a5`, but this doesn't happen today:
1. The pointers aren't marked `noalias`, so LICM can't initially hoist out the `*y * 2`.
2. The loop vectorizer vectorizes the loop with an aliasing check. `*y * 2` is now vectorized.
3. With the aliasing checks, LICM can now hoist the vectorized `*y * 2` into the preheader.
4. DAGCombiner will scalarize the `*y * 2` once it detects it is a splat, but by then it's too late: the splat can no longer be sunk into the loop body, so we can't match `vadd.vx`.
However, DAGCombiner isn't the only place that can scalarize bin ops of splats. `VectorCombine::scalarizeBinopOrCmp` can also scalarize bin ops, and because it runs before CodeGenPrepare, the splat can still be sunk into the loop body.
However, it currently only matches the canonical form that InstCombine produces for two **non-constant** splatted operands, e.g. `x[i] += *y * *z`:
```llvm
%x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
%y.insert = insertelement <vscale x 4 x i32> poison, i32 %y, i64 0
%0 = add <vscale x 4 x i32> %x.insert, %y.insert
%res = shufflevector <vscale x 4 x i32> %0, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
```
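For that form, the existing transform performs the operation on the scalars and re-inserts the result; on the example above it produces roughly the following (a sketch with illustrative value names; the folded base vector is poison here):
```llvm
%res.scalar = add i32 %x, %y
%0 = insertelement <vscale x 4 x i32> poison, i32 %res.scalar, i64 0
%res = shufflevector <vscale x 4 x i32> %0, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
```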
If one operand is constant, e.g. `x[i] += *y * 2`, then the IR will instead be of the form:
```llvm
%x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
%x.splat = shufflevector <vscale x 4 x i32> %x.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%res = add <vscale x 4 x i32> %x.splat, splat (i32 42)
```
This patch teaches VectorCombine to handle this case, and in doing so allows the splat to be avoided on RISC-V.
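Concretely, for the constant-operand example above the scalarized output now looks roughly like this (a sketch mirroring the new test below; value names are illustrative):
```llvm
%res.scalar = add i32 %x, 42
%res.head = insertelement <vscale x 4 x i32> poison, i32 %res.scalar, i64 0
%res.splat = shufflevector <vscale x 4 x i32> %res.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
```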
I initially created a separate transform for this, but there ended up being a lot of duplicated code, so I've tried to reuse the existing transform, which operates on insertelements.
Since there doesn't seem to be a dedicated "splat from scalar" cost hook, I've just reused the insertelement cost, which is hopefully close enough.
---
Full diff: https://github.com/llvm/llvm-project/pull/137786.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+72-53)
- (modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll (+36-12)
- (added) llvm/test/Transforms/VectorCombine/scalarize-binop.ll (+94)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 04c084ffdda97..cc6ef1656914c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1035,50 +1035,61 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
return false;
- // Match against one or both scalar values being inserted into constant
- // vectors:
- // vec_op VecC0, (inselt VecC1, V1, Index)
- // vec_op (inselt VecC0, V0, Index), VecC1
- // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
- // TODO: Deal with mismatched index constants and variable indexes?
Constant *VecC0 = nullptr, *VecC1 = nullptr;
Value *V0 = nullptr, *V1 = nullptr;
- uint64_t Index0 = 0, Index1 = 0;
- if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
- m_ConstantInt(Index0))) &&
- !match(Ins0, m_Constant(VecC0)))
- return false;
- if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
- m_ConstantInt(Index1))) &&
- !match(Ins1, m_Constant(VecC1)))
- return false;
+ std::optional<uint64_t> Index;
+
+ // Try and match against two splatted operands first.
+ // vec_op (splat V0), (splat V1)
+ V0 = getSplatValue(Ins0);
+ V1 = getSplatValue(Ins1);
+ if (!V0 || !V1) {
+ // Match against one or both scalar values being inserted into constant
+ // vectors:
+ // vec_op VecC0, (inselt VecC1, V1, Index)
+ // vec_op (inselt VecC0, V0, Index), VecC1
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
+ // TODO: Deal with mismatched index constants and variable indexes?
+ V0 = nullptr, V1 = nullptr;
+ uint64_t Index0 = 0, Index1 = 0;
+ if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index0))) &&
+ !match(Ins0, m_Constant(VecC0)))
+ return false;
+ if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+ m_ConstantInt(Index1))) &&
+ !match(Ins1, m_Constant(VecC1)))
+ return false;
- bool IsConst0 = !V0;
- bool IsConst1 = !V1;
- if (IsConst0 && IsConst1)
- return false;
- if (!IsConst0 && !IsConst1 && Index0 != Index1)
- return false;
+ bool IsConst0 = !V0;
+ bool IsConst1 = !V1;
+ if (IsConst0 && IsConst1)
+ return false;
+ if (!IsConst0 && !IsConst1 && Index0 != Index1)
+ return false;
- auto *VecTy0 = cast<VectorType>(Ins0->getType());
- auto *VecTy1 = cast<VectorType>(Ins1->getType());
- if (VecTy0->getElementCount().getKnownMinValue() <= Index0 ||
- VecTy1->getElementCount().getKnownMinValue() <= Index1)
- return false;
+ auto *VecTy0 = cast<VectorType>(Ins0->getType());
+ auto *VecTy1 = cast<VectorType>(Ins1->getType());
+ if (VecTy0->getElementCount().getKnownMinValue() <= Index0 ||
+ VecTy1->getElementCount().getKnownMinValue() <= Index1)
+ return false;
- // Bail for single insertion if it is a load.
- // TODO: Handle this once getVectorInstrCost can cost for load/stores.
- auto *I0 = dyn_cast_or_null<Instruction>(V0);
- auto *I1 = dyn_cast_or_null<Instruction>(V1);
- if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
- (IsConst1 && I0 && I0->mayReadFromMemory()))
- return false;
+ // Bail for single insertion if it is a load.
+ // TODO: Handle this once getVectorInstrCost can cost for load/stores.
+ auto *I0 = dyn_cast_or_null<Instruction>(V0);
+ auto *I1 = dyn_cast_or_null<Instruction>(V1);
+ if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
+ (IsConst1 && I0 && I0->mayReadFromMemory()))
+ return false;
+
+ Index = IsConst0 ? Index1 : Index0;
+ }
- uint64_t Index = IsConst0 ? Index1 : Index0;
- Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
- Type *VecTy = I.getType();
+ auto *VecTy = cast<VectorType>(I.getType());
+ Type *ScalarTy = VecTy->getElementType();
assert(VecTy->isVectorTy() &&
- (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
+ (isa<Constant>(Ins0) || isa<Constant>(Ins1) ||
+ V0->getType() == V1->getType()) &&
(ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
ScalarTy->isPointerTy()) &&
"Unexpected types for insert element into binop or cmp");
@@ -1099,12 +1110,14 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// Get cost estimate for the insert element. This cost will factor into
// both sequences.
InstructionCost InsertCost = TTI.getVectorInstrCost(
- Instruction::InsertElement, VecTy, CostKind, Index);
- InstructionCost OldCost =
- (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
- InstructionCost NewCost = ScalarOpCost + InsertCost +
- (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
- (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+ Instruction::InsertElement, VecTy, CostKind, Index.value_or(0));
+ InstructionCost OldCost = (isa<Constant>(Ins0) ? 0 : InsertCost) +
+ (isa<Constant>(Ins1) ? 0 : InsertCost) +
+ VectorOpCost;
+ InstructionCost NewCost =
+ ScalarOpCost + InsertCost +
+ (isa<Constant>(Ins0) ? 0 : !Ins0->hasOneUse() * InsertCost) +
+ (isa<Constant>(Ins1) ? 0 : !Ins1->hasOneUse() * InsertCost);
// We want to scalarize unless the vector variant actually has lower cost.
if (OldCost < NewCost || !NewCost.isValid())
@@ -1112,16 +1125,18 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
// inselt NewVecC, (scalar_op V0, V1), Index
+ //
+ // vec_op (splat V0), (splat V1) --> splat (scalar_op V0, V1)
if (IsCmp)
++NumScalarCmp;
else
++NumScalarBO;
// For constant cases, extract the scalar element, this should constant fold.
- if (IsConst0)
- V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
- if (IsConst1)
- V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+ if (Index && isa<Constant>(Ins0))
+ V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(*Index));
+ if (Index && isa<Constant>(Ins1))
+ V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(*Index));
Value *Scalar =
IsCmp ? Builder.CreateCmp(Pred, V0, V1)
@@ -1134,12 +1149,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
ScalarInst->copyIRFlags(&I);
- // Fold the vector constants in the original vectors into a new base vector.
- Value *NewVecC =
- IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
- Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
- replaceValue(I, *Insert);
+ Value *Result;
+ if (Index) {
+ // Fold the vector constants in the original vectors into a new base vector.
+ Value *NewVecC = IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode,
+ VecC0, VecC1);
+ Result = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
+ } else
+ Result = Builder.CreateVectorSplat(VecTy->getElementCount(), Scalar);
+ replaceValue(I, *Result);
return true;
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index f9108efa7ee79..0da4a72291380 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -271,12 +271,24 @@ define <4 x i32> @shuf_icmp_ugt_v4i32_use(<4 x i32> %x, <4 x i32> %y, <4 x i32>
; PR121110 - don't merge equivalent (but not matching) predicates
define <2 x i1> @PR121110() {
-; CHECK-LABEL: define <2 x i1> @PR121110(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
-; CHECK-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
-; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: ret <2 x i1> [[RES]]
+; SSE-LABEL: define <2 x i1> @PR121110(
+; SSE-SAME: ) #[[ATTR0]] {
+; SSE-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; SSE-NEXT: [[RES:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT: ret <2 x i1> [[RES]]
+;
+; AVX2-LABEL: define <2 x i1> @PR121110(
+; AVX2-SAME: ) #[[ATTR0]] {
+; AVX2-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT: ret <2 x i1> [[RES]]
+;
+; AVX512-LABEL: define <2 x i1> @PR121110(
+; AVX512-SAME: ) #[[ATTR0]] {
+; AVX512-NEXT: [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; AVX512-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[UGT]], <2 x i1> [[SGT]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: ret <2 x i1> [[RES]]
;
%ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
%sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
@@ -285,12 +297,24 @@ define <2 x i1> @PR121110() {
}
define <2 x i1> @PR121110_commute() {
-; CHECK-LABEL: define <2 x i1> @PR121110_commute(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
-; CHECK-NEXT: [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
-; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: ret <2 x i1> [[RES]]
+; SSE-LABEL: define <2 x i1> @PR121110_commute(
+; SSE-SAME: ) #[[ATTR0]] {
+; SSE-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; SSE-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> zeroinitializer, <2 x i32> <i32 0, i32 3>
+; SSE-NEXT: ret <2 x i1> [[RES]]
+;
+; AVX2-LABEL: define <2 x i1> @PR121110_commute(
+; AVX2-SAME: ) #[[ATTR0]] {
+; AVX2-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> zeroinitializer, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT: ret <2 x i1> [[RES]]
+;
+; AVX512-LABEL: define <2 x i1> @PR121110_commute(
+; AVX512-SAME: ) #[[ATTR0]] {
+; AVX512-NEXT: [[SGT:%.*]] = icmp sgt <2 x i32> zeroinitializer, <i32 6, i32 -4>
+; AVX512-NEXT: [[UGT:%.*]] = icmp samesign ugt <2 x i32> zeroinitializer, zeroinitializer
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <2 x i1> [[SGT]], <2 x i1> [[UGT]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: ret <2 x i1> [[RES]]
;
%sgt = icmp sgt <2 x i32> < i32 0, i32 0 >, < i32 6, i32 4294967292 >
%ugt = icmp samesign ugt <2 x i32> < i32 0, i32 0 >, < i32 0, i32 0 >
diff --git a/llvm/test/Transforms/VectorCombine/scalarize-binop.ll b/llvm/test/Transforms/VectorCombine/scalarize-binop.ll
new file mode 100644
index 0000000000000..0bc3826e73902
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/scalarize-binop.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=vector-combine < %s | FileCheck %s
+
+define <4 x i32> @add_v4i32(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i32> @add_v4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[RES_SCALAR:%.*]] = add i32 [[X]], 42
+; CHECK-NEXT: [[X_HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[RES_SCALAR]], i64 0
+; CHECK-NEXT: [[X_SPLAT:%.*]] = shufflevector <4 x i32> [[X_HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> [[X_SPLAT]]
+;
+ %x.head = insertelement <4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <4 x i32> %x.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %res = add <4 x i32> %x.splat, splat (i32 42)
+ ret <4 x i32> %res
+}
+
+define <vscale x 4 x i32> @add_nxv4i32(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i32> @add_nxv4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[RES_SCALAR:%.*]] = add i32 [[X]], 42
+; CHECK-NEXT: [[Y_HEAD1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[RES_SCALAR]], i64 0
+; CHECK-NEXT: [[Y_SPLAT1:%.*]] = shufflevector <vscale x 4 x i32> [[Y_HEAD1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: ret <vscale x 4 x i32> [[Y_SPLAT1]]
+;
+ %x.head = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <vscale x 4 x i32> %x.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %res = add <vscale x 4 x i32> %x.splat, splat (i32 42)
+ ret <vscale x 4 x i32> %res
+}
+
+; Make sure that we can scalarize sequences of vector instructions.
+define <4 x i32> @add_mul_v4i32(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: define <4 x i32> @add_mul_v4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]]) {
+; CHECK-NEXT: [[RES0_SCALAR:%.*]] = add i32 [[X]], 42
+; CHECK-NEXT: [[RES1_SCALAR:%.*]] = mul i32 [[RES0_SCALAR]], 42
+; CHECK-NEXT: [[Z_HEAD1:%.*]] = insertelement <4 x i32> poison, i32 [[RES1_SCALAR]], i64 0
+; CHECK-NEXT: [[Z_SPLAT1:%.*]] = shufflevector <4 x i32> [[Z_HEAD1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> [[Z_SPLAT1]]
+;
+ %x.head = insertelement <4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <4 x i32> %x.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %res0 = add <4 x i32> %x.splat, splat (i32 42)
+ %res1 = mul <4 x i32> %res0, splat (i32 42)
+ ret <4 x i32> %res1
+}
+
+; Shouldn't be scalarized since %x.splat and %y.splat have other users.
+define <4 x i32> @other_users_v4i32(i32 %x, i32 %y, ptr %p, ptr %q) {
+; CHECK-LABEL: define <4 x i32> @other_users_v4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[RES1:%.*]] = add <4 x i32> [[RES]], splat (i32 42)
+; CHECK-NEXT: store <4 x i32> [[RES]], ptr [[P]], align 16
+; CHECK-NEXT: store <4 x i32> [[RES]], ptr [[Q]], align 16
+; CHECK-NEXT: ret <4 x i32> [[RES1]]
+;
+ %x.head = insertelement <4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <4 x i32> %x.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %res = add <4 x i32> %x.splat, splat (i32 42)
+ store <4 x i32> %x.splat, ptr %p
+ store <4 x i32> %x.splat, ptr %q
+ ret <4 x i32> %res
+}
+
+define <4 x i1> @icmp_v4i32(i32 %x, i32 %y) {
+; CHECK-LABEL: define <4 x i1> @icmp_v4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[RES_SCALAR:%.*]] = icmp eq i32 [[X]], 42
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[RES_SCALAR]], i64 0
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i1> [[DOTSPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i1> [[RES]]
+;
+ %x.head = insertelement <4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <4 x i32> %x.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %res = icmp eq <4 x i32> %x.splat, splat (i32 42)
+ ret <4 x i1> %res
+}
+
+define <vscale x 4 x i1> @icmp_nxv4i32(i32 %x, i32 %y) {
+; CHECK-LABEL: define <vscale x 4 x i1> @icmp_nxv4i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[RES_SCALAR:%.*]] = icmp eq i32 [[X]], 42
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[RES_SCALAR]], i64 0
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <vscale x 4 x i1> [[DOTSPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: ret <vscale x 4 x i1> [[RES]]
+;
+ %x.head = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %x.splat = shufflevector <vscale x 4 x i32> %x.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %res = icmp eq <vscale x 4 x i32> %x.splat, splat (i32 42)
+ ret <vscale x 4 x i1> %res
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/137786