[llvm] [SLP] Allow UDiv X, C <--> LShr X, log2(C) transformations in BinOpSameOpcodeHelper (PR #181731)

Mon Feb 16 11:34:24 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Ryan Buchner (bababuck)

<details>
<summary>Changes</summary>

`UDiv` instructions whose divisor is a constant power of 2 are converted to `LShr` instructions before the SLP Vectorizer runs, which leads to suboptimal vectorization.

Prior to this change, compiling the following with
`clang -O3 -march=riscv64gcv -S`
```
void foo(unsigned * restrict A, unsigned * restrict B) {
  for (unsigned i = 2; i < 6; ++i) {
    A[i] = B[i] / i;
  }
}
```
lowers to
```
define dso_local void @foo(ptr noalias noundef writeonly captures(none) initializes((8, 24)) %A, ptr noalias noundef readonly captures(none) %B) local_unnamed_addr #0 {
entry:
  %arrayidx = getelementptr inbounds nuw i8, ptr %B, i64 8
  %arrayidx2 = getelementptr inbounds nuw i8, ptr %A, i64 8
  %0 = load <4 x i32>, ptr %arrayidx, align 4, !tbaa !6
  %1 = lshr <4 x i32> %0, <i32 1, i32 0, i32 2, i32 0>
  %2 = udiv <4 x i32> %1, <i32 1, i32 3, i32 1, i32 5>
  store <4 x i32> %2, ptr %arrayidx2, align 4, !tbaa !6
  ret void
}
```
With this change, the output improves to:
```
define dso_local void @foo(ptr noalias noundef writeonly captures(none) initializes((8, 24)) %A, ptr noalias noundef readonly captures(none) %B) local_unnamed_addr #0 {
entry:
  %arrayidx = getelementptr inbounds nuw i8, ptr %B, i64 8
  %arrayidx2 = getelementptr inbounds nuw i8, ptr %A, i64 8
  %0 = load <4 x i32>, ptr %arrayidx, align 4, !tbaa !6
  %1 = udiv <4 x i32> %0, <i32 2, i32 3, i32 4, i32 5>
  store <4 x i32> %1, ptr %arrayidx2, align 4, !tbaa !6
  ret void
}
```

---

Patch is 34.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181731.diff


7 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+42-13) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll (+7-14) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll (+4-6) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll (+13-6) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll (+4-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll (+14-12) 
- (modified) llvm/test/Transforms/SLPVectorizer/semanticly-same.ll (+324-17) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b29367fa1543f..dd4bae01629a3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -952,8 +952,9 @@ class BinOpSameOpcodeHelper {
   using MaskType = std::uint_fast16_t;
   /// Sort SupportedOp because it is used by binary_search.
   constexpr static std::initializer_list<unsigned> SupportedOp = {
-      Instruction::Add,  Instruction::Sub, Instruction::Mul, Instruction::Shl,
-      Instruction::AShr, Instruction::And, Instruction::Or,  Instruction::Xor};
+      Instruction::Add, Instruction::Sub,  Instruction::Mul,  Instruction::UDiv,
+      Instruction::Shl, Instruction::LShr, Instruction::AShr, Instruction::And,
+      Instruction::Or,  Instruction::Xor};
   static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
                 "SupportedOp is not sorted.");
   enum : MaskType {
@@ -965,7 +966,9 @@ class BinOpSameOpcodeHelper {
     AndBIT = 0b100000,
     OrBIT = 0b1000000,
     XorBIT = 0b10000000,
-    MainOpBIT = 0b100000000,
+    LShrBIT = 0b100000000,
+    UDivBIT = 0b1000000000,
+    MainOpBIT = 0b10000000000,
     LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
   };
   /// Return a non-nullptr if either operand of I is a ConstantInt.
@@ -982,7 +985,8 @@ class BinOpSameOpcodeHelper {
     if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
       return {CI, 1};
     if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
-        Opcode == Instruction::AShr)
+        Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
+        Opcode == Instruction::UDiv)
       return {nullptr, 0};
     if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
       return {CI, 0};
@@ -992,7 +996,7 @@ class BinOpSameOpcodeHelper {
     const Instruction *I = nullptr;
     /// The bit it sets represents whether MainOp can be converted to.
     MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
-                    MulBIT | AShrBIT | ShlBIT;
+                    MulBIT | AShrBIT | ShlBIT | LShrBIT | UDivBIT;
     /// We cannot create an interchangeable instruction that does not exist in
     /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
     /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
@@ -1033,6 +1037,10 @@ class BinOpSameOpcodeHelper {
         return Instruction::Or;
       if (Candidate & XorBIT)
         return Instruction::Xor;
+      if (Candidate & LShrBIT)
+        return Instruction::LShr;
+      if (Candidate & UDivBIT)
+        return Instruction::UDiv;
       llvm_unreachable("Cannot find interchangeable instruction.");
     }
 
@@ -1057,11 +1065,13 @@ class BinOpSameOpcodeHelper {
       case Instruction::Xor:
         return Candidate & XorBIT;
       case Instruction::LShr:
+        return Candidate & LShrBIT;
+      case Instruction::UDiv:
+        return Candidate & UDivBIT;
       case Instruction::FAdd:
       case Instruction::FSub:
       case Instruction::FMul:
       case Instruction::SDiv:
-      case Instruction::UDiv:
       case Instruction::FDiv:
       case Instruction::SRem:
       case Instruction::URem:
@@ -1086,7 +1096,8 @@ class BinOpSameOpcodeHelper {
       Constant *RHS;
       switch (FromOpcode) {
       case Instruction::Shl:
-        if (ToOpcode == Instruction::Mul) {
+      case Instruction::LShr:
+        if (ToOpcode == Instruction::Mul || ToOpcode == Instruction::UDiv) {
           RHS = ConstantInt::get(
               RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
                                            FromCIValue.getZExtValue()));
@@ -1097,8 +1108,9 @@ class BinOpSameOpcodeHelper {
         }
         break;
       case Instruction::Mul:
+      case Instruction::UDiv:
         assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
-        if (ToOpcode == Instruction::Shl) {
+        if (ToOpcode == Instruction::Shl || ToOpcode == Instruction::LShr) {
           RHS = ConstantInt::get(
               RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
         } else {
@@ -1166,7 +1178,8 @@ class BinOpSameOpcodeHelper {
            "BinOpSameOpcodeHelper only accepts BinaryOperator.");
     unsigned Opcode = I->getOpcode();
     MaskType OpcodeInMaskForm;
-    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
+    // Prefer Shl, AShr, Mul, Add, Sub, And, Or, Xor, LShr, and UDiv over
+    // MainOp.
     switch (Opcode) {
     case Instruction::Shl:
       OpcodeInMaskForm = ShlBIT;
@@ -1192,6 +1205,12 @@ class BinOpSameOpcodeHelper {
     case Instruction::Xor:
       OpcodeInMaskForm = XorBIT;
       break;
+    case Instruction::LShr:
+      OpcodeInMaskForm = LShrBIT;
+      break;
+    case Instruction::UDiv:
+      OpcodeInMaskForm = UDivBIT;
+      break;
     default:
       return MainOp.equal(Opcode) ||
              (initializeAltOp(I) && AltOp.equal(Opcode));
@@ -1199,21 +1218,31 @@ class BinOpSameOpcodeHelper {
     MaskType InterchangeableMask = OpcodeInMaskForm;
     ConstantInt *CI = isBinOpWithConstantInt(I).first;
     if (CI) {
-      constexpr MaskType CanBeAll =
-          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
+      constexpr MaskType CanBeAll = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
+                                    MulBIT | AShrBIT | ShlBIT | LShrBIT |
+                                    UDivBIT;
       const APInt &CIValue = CI->getValue();
       switch (Opcode) {
       case Instruction::Shl:
         if (CIValue.ult(CIValue.getBitWidth()))
           InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
         break;
+      case Instruction::LShr:
+        if (CIValue.ult(CIValue.getBitWidth()))
+          InterchangeableMask = CIValue.isZero() ? CanBeAll : UDivBIT | LShrBIT;
+        break;
       case Instruction::Mul:
+      case Instruction::UDiv:
         if (CIValue.isOne()) {
           InterchangeableMask = CanBeAll;
           break;
         }
-        if (CIValue.isPowerOf2())
-          InterchangeableMask = MulBIT | ShlBIT;
+        if (CIValue.isPowerOf2()) {
+          if (Opcode == Instruction::Mul)
+            InterchangeableMask = MulBIT | ShlBIT;
+          else // Instruction::UDiv
+            InterchangeableMask = UDivBIT | LShrBIT;
+        }
         break;
       case Instruction::Add:
       case Instruction::Sub:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
index 2b79ca9429fa3..3c6ea5a7cce87 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
@@ -7,25 +7,18 @@ define void @partial_vec_invalid_cost() #0 {
 ; CHECK-LABEL: define void @partial_vec_invalid_cost(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LSHR_1:%.*]] = lshr i96 0, 0
-; CHECK-NEXT:    [[LSHR_2:%.*]] = lshr i96 0, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i96> poison, i96 [[LSHR_1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i96> [[TMP0]], i96 [[LSHR_2]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i96> [[TMP1]], i96 0, i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i96> [[TMP2]], i96 0, i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i96> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[RDX_OP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[RDX_OP]])
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96
 ; CHECK-NEXT:    store i96 [[STORE_THIS]], ptr null, align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
-
-  %lshr.1 = lshr i96 0, 0 ; These ops
-  %lshr.2 = lshr i96 0, 0 ; return an
-  %add.0 = add i96 0, 0   ; invalid
-  %add.1 = add i96 0, 0   ; vector cost.
+  ; Test is broken, I don't think there are any pairs of binary ops that
+  ; can give an invalid cost
+  %lshr.1 = lshr i96 0, 0
+  %lshr.2 = lshr i96 0, 0
+  %add.0 = add i96 0, 0
+  %add.1 = add i96 0, 0
 
   %trunc.i96.1 = trunc i96 %lshr.1 to i32 ; These ops
   %trunc.i96.2 = trunc i96 %lshr.2 to i32 ; return an
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
index 5e85ecd610ebd..8a6705295df81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
@@ -6,8 +6,8 @@ define i32 @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], %[[BB24:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <4 x i32> [ [[TMP17:%.*]], %[[BB24]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ [[TMP14:%.*]], %[[BB24:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], %[[BB24]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
 ; CHECK-NEXT:    br i1 false, label %[[BB4:.*]], label %[[BB11:.*]]
 ; CHECK:       [[BB4]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ zeroinitializer, %[[BB1]] ]
@@ -33,10 +33,8 @@ define i32 @test() {
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], <i32 poison, i32 poison, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i32> [[TMP9]], <i32 poison, i32 poison, i32 0, i32 -1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP17]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP14]] = lshr <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 ; CHECK-NEXT:    br label %[[BB1]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
index a888027479817..edb69f5ca8293 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -51,14 +51,21 @@ entry:
 define void @test_add_udiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: @test_add_udiv(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[ARR1:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[A2:%.*]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], <i32 1, i32 1, i32 42, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
+; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
+; CHECK-NEXT:    [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
+; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
+; CHECK-NEXT:    [[RES2:%.*]] = udiv i32 [[V2]], [[Y2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = udiv <4 x i32> [[TMP0]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[RES2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1146, i32 146, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[V3]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP4]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[ARR2:%.*]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 3e9bd781bfea1..d85d5a61df53b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,9 +3,10 @@
 
 define void @test() {
 ; CHECK-LABEL: define void @test() {
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <14 x i64> <i64 undef, i64 undef, i64 0, i64 1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 0, i64 undef>, <14 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <14 x i64> [[TMP1]], <14 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 15, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <14 x i64> [[TMP2]], <14 x i64> <i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 14, i32 15, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <14 x i64> [[TMP3]], <14 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = freeze <16 x i1> [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 2dd6b395597c3..1301907b2e032 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -5,20 +5,22 @@ define i16 @test() {
 ; CHECK-LABEL: define i16 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 0, 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
 ; CHECK-NEXT:    [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
-; CHECK-NEXT:    [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP10]], i32 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT:    [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP17]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 0, 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 0, 0
+; CHECK-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/181731