[llvm] [SLP] Allow UDiv X, C <--> LShr X, log2(C) transformations in BinOpSameOpcodeHelper (PR #181731)
Ryan Buchner via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 16 11:33:53 PST 2026
https://github.com/bababuck created https://github.com/llvm/llvm-project/pull/181731
`UDiv` instructions with a constant power-of-2 divisor are combined into `LShr` instructions prior to the SLP Vectorizer, leading to suboptimal vectorization.
Prior to this change, compiling the following C code with
`clang -O3 -march=riscv64gcv -S`:
```
void foo(unsigned * restrict A, unsigned * restrict B) {
for (unsigned i = 2; i < 6; ++i) {
A[i] = B[i] / i;
}
}
```
lowers to
```
define dso_local void @foo(ptr noalias noundef writeonly captures(none) initializes((8, 24)) %A, ptr noalias noundef readonly captures(none) %B) local_unnamed_addr #0 {
entry:
%arrayidx = getelementptr inbounds nuw i8, ptr %B, i64 8
%arrayidx2 = getelementptr inbounds nuw i8, ptr %A, i64 8
%0 = load <4 x i32>, ptr %arrayidx, align 4, !tbaa !6
%1 = lshr <4 x i32> %0, <i32 1, i32 0, i32 2, i32 0>
%2 = udiv <4 x i32> %1, <i32 1, i32 3, i32 1, i32 5>
store <4 x i32> %2, ptr %arrayidx2, align 4, !tbaa !6
ret void
}
```
With this change, this is improved to:
```
define dso_local void @foo(ptr noalias noundef writeonly captures(none) initializes((8, 24)) %A, ptr noalias noundef readonly captures(none) %B) local_unnamed_addr #0 {
entry:
%arrayidx = getelementptr inbounds nuw i8, ptr %B, i64 8
%arrayidx2 = getelementptr inbounds nuw i8, ptr %A, i64 8
%0 = load <4 x i32>, ptr %arrayidx, align 4, !tbaa !6
%1 = udiv <4 x i32> %0, <i32 2, i32 3, i32 4, i32 5>
store <4 x i32> %1, ptr %arrayidx2, align 4, !tbaa !6
ret void
}
```
>From 81e60fef9d588124a11b8bc665d26262d198e57e Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Sun, 8 Feb 2026 09:56:09 -0800
Subject: [PATCH 1/4] [SLP] Add tests for LShr-UDiv power of 2 vectorization
---
.../SLPVectorizer/semanticly-same.ll | 345 +++++++++++++++++-
1 file changed, 328 insertions(+), 17 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
index c434fb5c97c07..ac0a219c9ae3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
+++ b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
@@ -1,17 +1,84 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s --check-prefixes=CHECK,X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s --check-prefixes=CHECK,AARCH64 %}
; Don't care about the profitability with these tests, just want to demonstrate the ability
; to combine opcodes
+define void @shl_add(ptr %p, ptr %s) {
+; CHECK-LABEL: @shl_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i16> [[TMP0]], <i16 3, i16 5, i16 0, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %shl0 = shl i16 %l0, 3
+ %shl1 = shl i16 %l1, 5
+ %shl2 = add i16 %l2, 0
+ %shl3 = shl i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %shl0, ptr %s
+ store i16 %shl1, ptr %s1
+ store i16 %shl2, ptr %s2
+ store i16 %shl3, ptr %s3
+ ret void
+}
+
+define void @add_shl(ptr %p, ptr %s) {
+; CHECK-LABEL: @add_shl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], <i16 0, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %add0 = shl i16 %l0, 0
+ %add1 = add i16 %l1, 5
+ %add2 = add i16 %l2, 2
+ %add3 = add i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %add0, ptr %s
+ store i16 %add1, ptr %s1
+ store i16 %add2, ptr %s2
+ store i16 %add3, ptr %s3
+ ret void
+}
+
define void @sub_mul(ptr %p, ptr %s) {
-; CHECK-LABEL: define void @sub_mul(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-LABEL: @sub_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> [[TMP0]], <i16 1, i16 5, i16 2, i16 3>
-; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
entry:
@@ -41,12 +108,11 @@ entry:
}
define void @add_mul(ptr %p, ptr %s) {
-; CHECK-LABEL: define void @add_mul(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-LABEL: @add_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> [[TMP0]], <i16 1, i16 5, i16 2, i16 3>
-; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
entry:
@@ -76,12 +142,11 @@ entry:
}
define void @sub_and(ptr %p, ptr %s) {
-; CHECK-LABEL: define void @sub_and(
-; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-LABEL: @sub_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i16> [[TMP0]], <i16 -1, i16 5, i16 2, i16 3>
-; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
entry:
@@ -109,3 +174,249 @@ entry:
store i16 %mul3, ptr %s3
ret void
}
+
+define void @shl_mul(ptr %p, ptr %s) {
+; CHECK-LABEL: @shl_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> [[TMP0]], <i16 8, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %mul0 = shl i16 %l0, 3
+ %mul1 = mul i16 %l1, 5
+ %mul2 = mul i16 %l2, 2
+ %mul3 = mul i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %mul0, ptr %s
+ store i16 %mul1, ptr %s1
+ store i16 %mul2, ptr %s2
+ store i16 %mul3, ptr %s3
+ ret void
+}
+
+define void @lshr_div(ptr %p, ptr %s) {
+; CHECK-LABEL: @lshr_div(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 0>
+; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP2]], <i16 1, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %div0 = lshr i16 %l0, 3
+ %div1 = udiv i16 %l1, 5
+ %div2 = udiv i16 %l2, 2
+ %div3 = udiv i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %div0, ptr %s
+ store i16 %div1, ptr %s1
+ store i16 %div2, ptr %s2
+ store i16 %div3, ptr %s3
+ ret void
+}
+
+define void @div_lshr(ptr %p, ptr %s) {
+; CHECK-LABEL: @div_lshr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 1, i16 2, i16 1>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP2]], <i16 0, i16 3, i16 0, i16 9>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %div0 = udiv i16 %l0, 5
+ %div1 = lshr i16 %l1, 3
+ %div2 = udiv i16 %l2, 2
+ %div3 = lshr i16 %l3, 9
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %div0, ptr %s
+ store i16 %div1, ptr %s1
+ store i16 %div2, ptr %s2
+ store i16 %div3, ptr %s3
+ ret void
+}
+
+define void @div_lshr_too_large(ptr %p, ptr %s) {
+; CHECK-LABEL: @div_lshr_too_large(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 1, i16 2, i16 1>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i16> [[TMP1]], <i16 0, i16 17, i16 0, i16 9>
+; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %div0 = udiv i16 %l0, 5
+ %div1 = lshr i16 %l1, 17
+ %div2 = udiv i16 %l2, 2
+ %div3 = lshr i16 %l3, 9
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %div0, ptr %s
+ store i16 %div1, ptr %s1
+ store i16 %div2, ptr %s2
+ store i16 %div3, ptr %s3
+ ret void
+}
+
+define void @lshr_div2(ptr %p, ptr %s) {
+; CHECK-LABEL: @lshr_div2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 5>
+; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], <i16 1, i16 8, i16 2, i16 1>
+; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %div0 = lshr i16 %l0, 3
+ %div1 = udiv i16 %l1, 8
+ %div2 = udiv i16 %l2, 2
+ %div3 = lshr i16 %l3, 5
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %div0, ptr %s
+ store i16 %div1, ptr %s1
+ store i16 %div2, ptr %s2
+ store i16 %div3, ptr %s3
+ ret void
+}
+
+define void @div2_lshr(ptr %p, ptr %s) {
+; CHECK-LABEL: @div2_lshr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP0]], <i16 8, i16 1, i16 4, i16 1>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP2]], <i16 0, i16 3, i16 0, i16 4>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %div0 = udiv i16 %l0, 8
+ %div1 = lshr i16 %l1, 3
+ %div2 = udiv i16 %l2, 4
+ %div3 = lshr i16 %l3, 4
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %div0, ptr %s
+ store i16 %div1, ptr %s1
+ store i16 %div2, ptr %s2
+ store i16 %div3, ptr %s3
+ ret void
+}
+
+define void @add_sub(ptr %p, ptr %s) {
+; CHECK-LABEL: @add_sub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], <i16 3, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %add0 = sub i16 %l0, -3
+ %add1 = add i16 %l1, 5
+ %add2 = add i16 %l2, 2
+ %add3 = add i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %add0, ptr %s
+ store i16 %add1, ptr %s1
+ store i16 %add2, ptr %s2
+ store i16 %add3, ptr %s3
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AARCH64: {{.*}}
+; X86: {{.*}}
>From 2d15788773feb9a82c58ddee87b4466d0ddf8a74 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <buchner.ryan at gmail.com>
Date: Fri, 13 Feb 2026 16:34:49 -0800
Subject: [PATCH 2/4] [SLP] Support LShr in BinOpSameOpcodeHelper
LShr X, 0 is an identity.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 24 +++++++++-----
.../RISCV/partial-vec-invalid-cost.ll | 21 +++++--------
.../X86/matching-insert-point-for-nodes.ll | 10 +++---
.../X86/non-power-of-2-subvectors-insert.ll | 7 +++--
.../X86/reschedule-only-scheduled.ll | 26 +++++++++-------
.../SLPVectorizer/semanticly-same.ll | 31 ++++++++++++++-----
6 files changed, 69 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b29367fa1543f..54696dfa2aa72 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -952,8 +952,9 @@ class BinOpSameOpcodeHelper {
using MaskType = std::uint_fast16_t;
/// Sort SupportedOp because it is used by binary_search.
constexpr static std::initializer_list<unsigned> SupportedOp = {
- Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
- Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
+ Instruction::Add, Instruction::Sub, Instruction::Mul,
+ Instruction::Shl, Instruction::LShr, Instruction::AShr,
+ Instruction::And, Instruction::Or, Instruction::Xor};
static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
"SupportedOp is not sorted.");
enum : MaskType {
@@ -965,7 +966,8 @@ class BinOpSameOpcodeHelper {
AndBIT = 0b100000,
OrBIT = 0b1000000,
XorBIT = 0b10000000,
- MainOpBIT = 0b100000000,
+ LShrBIT = 0b100000000,
+ MainOpBIT = 0b1000000000,
LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
};
/// Return a non-nullptr if either operand of I is a ConstantInt.
@@ -982,7 +984,7 @@ class BinOpSameOpcodeHelper {
if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
return {CI, 1};
if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
- Opcode == Instruction::AShr)
+ Opcode == Instruction::AShr || Opcode == Instruction::LShr)
return {nullptr, 0};
if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
return {CI, 0};
@@ -992,7 +994,7 @@ class BinOpSameOpcodeHelper {
const Instruction *I = nullptr;
/// The bit it sets represents whether MainOp can be converted to.
MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
- MulBIT | AShrBIT | ShlBIT;
+ MulBIT | AShrBIT | ShlBIT | LShrBIT;
/// We cannot create an interchangeable instruction that does not exist in
/// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
/// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
@@ -1033,6 +1035,8 @@ class BinOpSameOpcodeHelper {
return Instruction::Or;
if (Candidate & XorBIT)
return Instruction::Xor;
+ if (Candidate & LShrBIT)
+ return Instruction::LShr;
llvm_unreachable("Cannot find interchangeable instruction.");
}
@@ -1057,6 +1061,7 @@ class BinOpSameOpcodeHelper {
case Instruction::Xor:
return Candidate & XorBIT;
case Instruction::LShr:
+ return Candidate & LShrBIT;
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
@@ -1166,7 +1171,7 @@ class BinOpSameOpcodeHelper {
"BinOpSameOpcodeHelper only accepts BinaryOperator.");
unsigned Opcode = I->getOpcode();
MaskType OpcodeInMaskForm;
- // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
+ // Prefer Shl, AShr, Mul, Add, Sub, And, Or, Xor, and LShr over MainOp.
switch (Opcode) {
case Instruction::Shl:
OpcodeInMaskForm = ShlBIT;
@@ -1192,6 +1197,9 @@ class BinOpSameOpcodeHelper {
case Instruction::Xor:
OpcodeInMaskForm = XorBIT;
break;
+ case Instruction::LShr:
+ OpcodeInMaskForm = LShrBIT;
+ break;
default:
return MainOp.equal(Opcode) ||
(initializeAltOp(I) && AltOp.equal(Opcode));
@@ -1199,8 +1207,8 @@ class BinOpSameOpcodeHelper {
MaskType InterchangeableMask = OpcodeInMaskForm;
ConstantInt *CI = isBinOpWithConstantInt(I).first;
if (CI) {
- constexpr MaskType CanBeAll =
- XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
+ constexpr MaskType CanBeAll = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
+ MulBIT | AShrBIT | ShlBIT | LShrBIT;
const APInt &CIValue = CI->getValue();
switch (Opcode) {
case Instruction::Shl:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
index 2b79ca9429fa3..3c6ea5a7cce87 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
@@ -7,25 +7,18 @@ define void @partial_vec_invalid_cost() #0 {
; CHECK-LABEL: define void @partial_vec_invalid_cost(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[LSHR_1:%.*]] = lshr i96 0, 0
-; CHECK-NEXT: [[LSHR_2:%.*]] = lshr i96 0, 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i96> poison, i96 [[LSHR_1]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i96> [[TMP0]], i96 [[LSHR_2]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i96> [[TMP1]], i96 0, i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i96> [[TMP2]], i96 0, i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i96> [[TMP3]] to <4 x i32>
-; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[RDX_OP]])
+; CHECK-NEXT: [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
; CHECK-NEXT: [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96
; CHECK-NEXT: store i96 [[STORE_THIS]], ptr null, align 16
; CHECK-NEXT: ret void
;
entry:
-
- %lshr.1 = lshr i96 0, 0 ; These ops
- %lshr.2 = lshr i96 0, 0 ; return an
- %add.0 = add i96 0, 0 ; invalid
- %add.1 = add i96 0, 0 ; vector cost.
+ ; Test is broken, I don't think there are any pairs of binary ops that
+ ; can give an invalid cost
+ %lshr.1 = lshr i96 0, 0
+ %lshr.2 = lshr i96 0, 0
+ %add.0 = add i96 0, 0
+ %add.1 = add i96 0, 0
%trunc.i96.1 = trunc i96 %lshr.1 to i32 ; These ops
%trunc.i96.2 = trunc i96 %lshr.2 to i32 ; return an
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
index 5e85ecd610ebd..8a6705295df81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matching-insert-point-for-nodes.ll
@@ -6,8 +6,8 @@ define i32 @test() {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
-; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], %[[BB24:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP17:%.*]], %[[BB24]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP14:%.*]], %[[BB24:.*]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], %[[BB24]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB]] ]
; CHECK-NEXT: br i1 false, label %[[BB4:.*]], label %[[BB11:.*]]
; CHECK: [[BB4]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ zeroinitializer, %[[BB1]] ]
@@ -33,10 +33,8 @@ define i32 @test() {
; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], <i32 poison, i32 poison, i32 0, i32 0>
; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i32> [[TMP9]], <i32 poison, i32 poison, i32 0, i32 -1>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
-; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
+; CHECK-NEXT: [[TMP14]] = lshr <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT: br label %[[BB1]]
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 3e9bd781bfea1..d85d5a61df53b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,9 +3,10 @@
define void @test() {
; CHECK-LABEL: define void @test() {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <14 x i64> <i64 undef, i64 undef, i64 0, i64 1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 0, i64 undef>, <14 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <14 x i64> [[TMP1]], <14 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 15, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <14 x i64> [[TMP2]], <14 x i64> <i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <14 x i32> <i32 14, i32 15, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <14 x i64> [[TMP3]], <14 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 2dd6b395597c3..1301907b2e032 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -5,20 +5,22 @@ define i16 @test() {
; CHECK-LABEL: define i16 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 0, 0
-; CHECK-NEXT: [[TMP10:%.*]] = shl i32 [[TMP0]], 0
; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP17]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 0, 0
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 0, 0
+; CHECK-NEXT: [[UNSCLEAR78_I:%.*]] = and i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <12 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 0, i32 0>, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <12 x i32> [[TMP3]], i32 [[CALL99_I]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x i32> [[TMP4]], i32 [[CALL7_I45]], i32 8
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <12 x i32> [[TMP5]], i32 [[TMP1]], i32 9
+; CHECK-NEXT: [[TMP7:%.*]] = shl <12 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <12 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <12 x i32> [[TMP8]], <12 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> [[TMP9]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <32 x i32> [[TMP10]], i32 [[UNSCLEAR78_I]], i32 16
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 32, i32 33, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP16]], <32 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
diff --git a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
index ac0a219c9ae3b..4e7a8270bde78 100644
--- a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
+++ b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
@@ -315,13 +315,30 @@ entry:
}
define void @lshr_div2(ptr %p, ptr %s) {
-; CHECK-LABEL: @lshr_div2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 5>
-; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], <i16 1, i16 8, i16 2, i16 1>
-; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
-; CHECK-NEXT: ret void
+; X86-LABEL: @lshr_div2(
+; X86-NEXT: entry:
+; X86-NEXT: [[P1:%.*]] = getelementptr i16, ptr [[P:%.*]], i64 1
+; X86-NEXT: [[P3:%.*]] = getelementptr i16, ptr [[P]], i64 3
+; X86-NEXT: [[L0:%.*]] = load i16, ptr [[P]], align 2
+; X86-NEXT: [[L3:%.*]] = load i16, ptr [[P3]], align 2
+; X86-NEXT: [[DIV0:%.*]] = lshr i16 [[L0]], 3
+; X86-NEXT: [[DIV3:%.*]] = lshr i16 [[L3]], 5
+; X86-NEXT: [[S1:%.*]] = getelementptr i16, ptr [[S:%.*]], i64 1
+; X86-NEXT: [[S3:%.*]] = getelementptr i16, ptr [[S]], i64 3
+; X86-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P1]], align 2
+; X86-NEXT: [[TMP1:%.*]] = udiv <2 x i16> [[TMP0]], <i16 8, i16 2>
+; X86-NEXT: store i16 [[DIV0]], ptr [[S]], align 2
+; X86-NEXT: store <2 x i16> [[TMP1]], ptr [[S1]], align 2
+; X86-NEXT: store i16 [[DIV3]], ptr [[S3]], align 2
+; X86-NEXT: ret void
+;
+; AARCH64-LABEL: @lshr_div2(
+; AARCH64-NEXT: entry:
+; AARCH64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; AARCH64-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 5>
+; AARCH64-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], <i16 1, i16 8, i16 2, i16 1>
+; AARCH64-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
+; AARCH64-NEXT: ret void
;
entry:
%p1 = getelementptr i16, ptr %p, i64 1
>From a4d92119b69a661cf5693b582cf6af80bc1a7772 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <buchner.ryan at gmail.com>
Date: Fri, 13 Feb 2026 16:49:49 -0800
Subject: [PATCH 3/4] [SLP] Support UDiv in BinOpSameOpcodeHelper
UDiv X, 1 is an identity
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 29 +++++++++++++------
.../SLPVectorizer/X86/no_alternate_divrem.ll | 19 ++++++++----
2 files changed, 33 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 54696dfa2aa72..4d630f4a0412e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -952,9 +952,9 @@ class BinOpSameOpcodeHelper {
using MaskType = std::uint_fast16_t;
/// Sort SupportedOp because it is used by binary_search.
constexpr static std::initializer_list<unsigned> SupportedOp = {
- Instruction::Add, Instruction::Sub, Instruction::Mul,
- Instruction::Shl, Instruction::LShr, Instruction::AShr,
- Instruction::And, Instruction::Or, Instruction::Xor};
+ Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::UDiv,
+ Instruction::Shl, Instruction::LShr, Instruction::AShr, Instruction::And,
+ Instruction::Or, Instruction::Xor};
static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
"SupportedOp is not sorted.");
enum : MaskType {
@@ -967,7 +967,8 @@ class BinOpSameOpcodeHelper {
OrBIT = 0b1000000,
XorBIT = 0b10000000,
LShrBIT = 0b100000000,
- MainOpBIT = 0b1000000000,
+ UDivBIT = 0b1000000000,
+ MainOpBIT = 0b10000000000,
LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
};
/// Return a non-nullptr if either operand of I is a ConstantInt.
@@ -984,7 +985,8 @@ class BinOpSameOpcodeHelper {
if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
return {CI, 1};
if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
- Opcode == Instruction::AShr || Opcode == Instruction::LShr)
+ Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
+ Opcode == Instruction::UDiv)
return {nullptr, 0};
if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
return {CI, 0};
@@ -994,7 +996,7 @@ class BinOpSameOpcodeHelper {
const Instruction *I = nullptr;
/// The bit it sets represents whether MainOp can be converted to.
MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
- MulBIT | AShrBIT | ShlBIT | LShrBIT;
+ MulBIT | AShrBIT | ShlBIT | LShrBIT | UDivBIT;
/// We cannot create an interchangeable instruction that does not exist in
/// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
/// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
@@ -1037,6 +1039,8 @@ class BinOpSameOpcodeHelper {
return Instruction::Xor;
if (Candidate & LShrBIT)
return Instruction::LShr;
+ if (Candidate & UDivBIT)
+ return Instruction::UDiv;
llvm_unreachable("Cannot find interchangeable instruction.");
}
@@ -1062,11 +1066,12 @@ class BinOpSameOpcodeHelper {
return Candidate & XorBIT;
case Instruction::LShr:
return Candidate & LShrBIT;
+ case Instruction::UDiv:
+ return Candidate & UDivBIT;
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
case Instruction::SDiv:
- case Instruction::UDiv:
case Instruction::FDiv:
case Instruction::SRem:
case Instruction::URem:
@@ -1102,6 +1107,7 @@ class BinOpSameOpcodeHelper {
}
break;
case Instruction::Mul:
+ case Instruction::UDiv:
assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
if (ToOpcode == Instruction::Shl) {
RHS = ConstantInt::get(
@@ -1171,7 +1177,8 @@ class BinOpSameOpcodeHelper {
"BinOpSameOpcodeHelper only accepts BinaryOperator.");
unsigned Opcode = I->getOpcode();
MaskType OpcodeInMaskForm;
- // Prefer Shl, AShr, Mul, Add, Sub, And, Or, Xor, and LShr over MainOp.
+ // Prefer Shl, AShr, Mul, Add, Sub, And, Or, Xor, LShr, and UDiv over
+ // MainOp.
switch (Opcode) {
case Instruction::Shl:
OpcodeInMaskForm = ShlBIT;
@@ -1200,6 +1207,9 @@ class BinOpSameOpcodeHelper {
case Instruction::LShr:
OpcodeInMaskForm = LShrBIT;
break;
+ case Instruction::UDiv:
+ OpcodeInMaskForm = UDivBIT;
+ break;
default:
return MainOp.equal(Opcode) ||
(initializeAltOp(I) && AltOp.equal(Opcode));
@@ -1208,7 +1218,8 @@ class BinOpSameOpcodeHelper {
ConstantInt *CI = isBinOpWithConstantInt(I).first;
if (CI) {
constexpr MaskType CanBeAll = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
- MulBIT | AShrBIT | ShlBIT | LShrBIT;
+ MulBIT | AShrBIT | ShlBIT | LShrBIT |
+ UDivBIT;
const APInt &CIValue = CI->getValue();
switch (Opcode) {
case Instruction::Shl:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
index a888027479817..edb69f5ca8293 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -51,14 +51,21 @@ entry:
define void @test_add_udiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: @test_add_udiv(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARR1:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[A2:%.*]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], <i32 1, i32 1, i32 42, i32 1>
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2
+; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3
+; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4
+; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4
+; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4
+; CHECK-NEXT: [[RES2:%.*]] = udiv i32 [[V2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 1146, i32 146, i32 0, i32 0>, [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP0]], [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[RES2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1146, i32 146, i32 0, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, i32 [[V3]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP4]]
; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[ARR2:%.*]], align 4
; CHECK-NEXT: ret void
>From 7599076ee25a15c2e8f6f7db0a2ca9561e175dd5 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <buchner.ryan at gmail.com>
Date: Fri, 13 Feb 2026 16:52:59 -0800
Subject: [PATCH 4/4] [SLP] Allow UDiv X, C <--> LShr X, log2(C) transformations
in BinOpSameOpcodeHelper
This applies when C is a constant power of 2.
This is important since InstCombine may convert a UDiv into an LShr before the SLP vectorizer runs.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 18 ++++++--
.../SLPVectorizer/semanticly-same.ll | 43 +++++--------------
2 files changed, 25 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4d630f4a0412e..dd4bae01629a3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1096,7 +1096,8 @@ class BinOpSameOpcodeHelper {
Constant *RHS;
switch (FromOpcode) {
case Instruction::Shl:
- if (ToOpcode == Instruction::Mul) {
+ case Instruction::LShr:
+ if (ToOpcode == Instruction::Mul || ToOpcode == Instruction::UDiv) {
RHS = ConstantInt::get(
RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
FromCIValue.getZExtValue()));
@@ -1109,7 +1110,7 @@ class BinOpSameOpcodeHelper {
case Instruction::Mul:
case Instruction::UDiv:
assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
- if (ToOpcode == Instruction::Shl) {
+ if (ToOpcode == Instruction::Shl || ToOpcode == Instruction::LShr) {
RHS = ConstantInt::get(
RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
} else {
@@ -1226,13 +1227,22 @@ class BinOpSameOpcodeHelper {
if (CIValue.ult(CIValue.getBitWidth()))
InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
break;
+ case Instruction::LShr:
+ if (CIValue.ult(CIValue.getBitWidth()))
+ InterchangeableMask = CIValue.isZero() ? CanBeAll : UDivBIT | LShrBIT;
+ break;
case Instruction::Mul:
+ case Instruction::UDiv:
if (CIValue.isOne()) {
InterchangeableMask = CanBeAll;
break;
}
- if (CIValue.isPowerOf2())
- InterchangeableMask = MulBIT | ShlBIT;
+ if (CIValue.isPowerOf2()) {
+ if (Opcode == Instruction::Mul)
+ InterchangeableMask = MulBIT | ShlBIT;
+ else // Instruction::UDiv
+ InterchangeableMask = UDivBIT | LShrBIT;
+ }
break;
case Instruction::Add:
case Instruction::Sub:
diff --git a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
index 4e7a8270bde78..8fb52eedd9574 100644
--- a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
+++ b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
@@ -213,8 +213,7 @@ define void @lshr_div(ptr %p, ptr %s) {
; CHECK-LABEL: @lshr_div(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 0>
-; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP2]], <i16 1, i16 5, i16 2, i16 3>
+; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP0]], <i16 8, i16 5, i16 2, i16 3>
; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
@@ -248,8 +247,7 @@ define void @div_lshr(ptr %p, ptr %s) {
; CHECK-LABEL: @div_lshr(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 1, i16 2, i16 1>
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP2]], <i16 0, i16 3, i16 0, i16 9>
+; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 8, i16 2, i16 512>
; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
@@ -283,8 +281,8 @@ define void @div_lshr_too_large(ptr %p, ptr %s) {
; CHECK-LABEL: @div_lshr_too_large(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 1, i16 2, i16 1>
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i16> [[TMP1]], <i16 0, i16 17, i16 0, i16 9>
+; CHECK-NEXT: [[TMP1:%.*]] = udiv <4 x i16> [[TMP0]], <i16 5, i16 1, i16 1, i16 1>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i16> [[TMP1]], <i16 0, i16 17, i16 1, i16 9>
; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
@@ -315,30 +313,12 @@ entry:
}
define void @lshr_div2(ptr %p, ptr %s) {
-; X86-LABEL: @lshr_div2(
-; X86-NEXT: entry:
-; X86-NEXT: [[P1:%.*]] = getelementptr i16, ptr [[P:%.*]], i64 1
-; X86-NEXT: [[P3:%.*]] = getelementptr i16, ptr [[P]], i64 3
-; X86-NEXT: [[L0:%.*]] = load i16, ptr [[P]], align 2
-; X86-NEXT: [[L3:%.*]] = load i16, ptr [[P3]], align 2
-; X86-NEXT: [[DIV0:%.*]] = lshr i16 [[L0]], 3
-; X86-NEXT: [[DIV3:%.*]] = lshr i16 [[L3]], 5
-; X86-NEXT: [[S1:%.*]] = getelementptr i16, ptr [[S:%.*]], i64 1
-; X86-NEXT: [[S3:%.*]] = getelementptr i16, ptr [[S]], i64 3
-; X86-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[P1]], align 2
-; X86-NEXT: [[TMP1:%.*]] = udiv <2 x i16> [[TMP0]], <i16 8, i16 2>
-; X86-NEXT: store i16 [[DIV0]], ptr [[S]], align 2
-; X86-NEXT: store <2 x i16> [[TMP1]], ptr [[S1]], align 2
-; X86-NEXT: store i16 [[DIV3]], ptr [[S3]], align 2
-; X86-NEXT: ret void
-;
-; AARCH64-LABEL: @lshr_div2(
-; AARCH64-NEXT: entry:
-; AARCH64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; AARCH64-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 0, i16 0, i16 5>
-; AARCH64-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], <i16 1, i16 8, i16 2, i16 1>
-; AARCH64-NEXT: store <4 x i16> [[TMP2]], ptr [[S:%.*]], align 2
-; AARCH64-NEXT: ret void
+; CHECK-LABEL: @lshr_div2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 3, i16 1, i16 5>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
+; CHECK-NEXT: ret void
;
entry:
%p1 = getelementptr i16, ptr %p, i64 1
@@ -370,8 +350,7 @@ define void @div2_lshr(ptr %p, ptr %s) {
; CHECK-LABEL: @div2_lshr(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP0]], <i16 8, i16 1, i16 4, i16 1>
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP2]], <i16 0, i16 3, i16 0, i16 4>
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i16> [[TMP0]], <i16 3, i16 3, i16 2, i16 4>
; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S:%.*]], align 2
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list