[llvm] 95ef1a5 - [SLP] Use the correct identity when combining binary opcodes with AND/MUL (#180457)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 09:34:53 PST 2026
Author: Ryan Buchner
Date: 2026-02-12T09:34:44-08:00
New Revision: 95ef1a5c3139c44bb171df225cb2b0aa17b50f1d
URL: https://github.com/llvm/llvm-project/commit/95ef1a5c3139c44bb171df225cb2b0aa17b50f1d
DIFF: https://github.com/llvm/llvm-project/commit/95ef1a5c3139c44bb171df225cb2b0aa17b50f1d.diff
LOG: [SLP] Use the correct identity when combining binary opcodes with AND/MUL (#180457)
Fixes #180456
Fixes a bug in the following SLP lowering:
```
define void @sub_mul(ptr %p, ptr %s) {
entry:
%p1 = getelementptr i16, ptr %p, i64 1
%l0 = load i16, ptr %p
%l1 = load i16, ptr %p1
%mul0 = sub i16 %l0, 0
%mul1 = mul i16 %l1, 5
%s1 = getelementptr i16, ptr %s, i64 1
store i16 %mul0, ptr %s
store i16 %mul1, ptr %s1
ret void
}
```
to
```
define void @sub_mul(ptr %p, ptr %s) {
entry:
%tmp0 = load <2 x i16>, ptr %p, align 2
%tmp1 = mul <2 x i16> %tmp0, <i16 0, i16 5> ; buggy: the `sub i16 %l0, 0` lane must use the multiplicative identity, so the fix changes the constant to <i16 1, i16 5>
store <2 x i16> %tmp1, ptr %s, align 2
ret void
}
```
Added:
llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8fb88a1fac0ef..324c5729c3f5d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1080,55 +1080,56 @@ class BinOpSameOpcodeHelper {
auto [CI, Pos] = isBinOpWithConstantInt(I);
const APInt &FromCIValue = CI->getValue();
unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
- APInt ToCIValue;
+ Type *RHSType = I->getOperand(Pos)->getType();
+ Constant *RHS;
switch (FromOpcode) {
case Instruction::Shl:
if (ToOpcode == Instruction::Mul) {
- ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
- FromCIValue.getZExtValue());
+ RHS = ConstantInt::get(
+ RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
+ FromCIValue.getZExtValue()));
} else {
assert(FromCIValue.isZero() && "Cannot convert the instruction.");
- ToCIValue = ToOpcode == Instruction::And
- ? APInt::getAllOnes(FromCIValueBitWidth)
- : APInt::getZero(FromCIValueBitWidth);
+ RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
+ /*AllowRHSConstant=*/true);
}
break;
case Instruction::Mul:
assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
if (ToOpcode == Instruction::Shl) {
- ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
+ RHS = ConstantInt::get(
+ RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
} else {
assert(FromCIValue.isOne() && "Cannot convert the instruction.");
- ToCIValue = ToOpcode == Instruction::And
- ? APInt::getAllOnes(FromCIValueBitWidth)
- : APInt::getZero(FromCIValueBitWidth);
+ RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
+ /*AllowRHSConstant=*/true);
}
break;
case Instruction::Add:
case Instruction::Sub:
if (FromCIValue.isZero()) {
- ToCIValue = APInt::getZero(FromCIValueBitWidth);
+ RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
+ /*AllowRHSConstant=*/true);
} else {
assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
"Cannot convert the instruction.");
- ToCIValue = FromCIValue;
- ToCIValue.negate();
+ APInt NegatedVal = APInt(FromCIValue);
+ NegatedVal.negate();
+ RHS = ConstantInt::get(RHSType, NegatedVal);
}
break;
case Instruction::And:
assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
- ToCIValue = ToOpcode == Instruction::Mul
- ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
- : APInt::getZero(FromCIValueBitWidth);
+ RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
+ /*AllowRHSConstant=*/true);
break;
default:
assert(FromCIValue.isZero() && "Cannot convert the instruction.");
- ToCIValue = APInt::getZero(FromCIValueBitWidth);
+ RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
+ /*AllowRHSConstant=*/true);
break;
}
Value *LHS = I->getOperand(1 - Pos);
- Constant *RHS =
- ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
// constant + x cannot be -constant - x
// instead, it should be x - -constant
if (Pos == 1 ||
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
index 03a89e54e4212..aa424b9031e77 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
@@ -13,7 +13,7 @@ define void @test() {
; CHECK: [[BB6]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP8:%.*]], %[[BB6]] ]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 4>
; CHECK-NEXT: [[TMP4]] = mul <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[TMP8]] = mul <2 x i32> zeroinitializer, [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
index c4ddc5d63cc04..21a93e57f6ec6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
@@ -5,7 +5,7 @@ define i16 @test(i16 %v1, i16 %v2) {
; CHECK-LABEL: define i16 @test(
; CHECK-SAME: i16 [[V1:%.*]], i16 [[V2:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[V2]], i32 3
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> <i16 -1, i16 -1, i16 -1, i16 poison>, i16 [[V2]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[V1]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 poison, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
index f472f77c281ea..cb0fd96856262 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-child-node-used-outside.ll
@@ -10,7 +10,7 @@ define <4 x i32> @test() {
; CHECK-NEXT: [[OR:%.*]] = or i32 [[TRUNC]], 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[TRUNC]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> <i32 -1, i32 0, i32 0, i32 0>, [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> <i32 -1, i32 0, i32 -1, i32 -1>, [[TMP2]]
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[OR]] to i64
; CHECK-NEXT: br label %[[BB3:.*]]
; CHECK: [[BB3]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
new file mode 100644
index 0000000000000..c434fb5c97c07
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/semanticly-same.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-50 < %s | FileCheck %s %}
+
+; Don't care about the profitability with these tests, just want to demonstrate the ability
+; to combine opcodes
+
+define void @sub_mul(ptr %p, ptr %s) {
+; CHECK-LABEL: define void @sub_mul(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> [[TMP0]], <i16 1, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %mul0 = sub i16 %l0, 0
+ %mul1 = mul i16 %l1, 5
+ %mul2 = mul i16 %l2, 2
+ %mul3 = mul i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %mul0, ptr %s
+ store i16 %mul1, ptr %s1
+ store i16 %mul2, ptr %s2
+ store i16 %mul3, ptr %s3
+ ret void
+}
+
+define void @add_mul(ptr %p, ptr %s) {
+; CHECK-LABEL: define void @add_mul(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> [[TMP0]], <i16 1, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %mul0 = add i16 %l0, 0
+ %mul1 = mul i16 %l1, 5
+ %mul2 = mul i16 %l2, 2
+ %mul3 = mul i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %mul0, ptr %s
+ store i16 %mul1, ptr %s1
+ store i16 %mul2, ptr %s2
+ store i16 %mul3, ptr %s3
+ ret void
+}
+
+define void @sub_and(ptr %p, ptr %s) {
+; CHECK-LABEL: define void @sub_and(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[S:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i16> [[TMP0]], <i16 -1, i16 5, i16 2, i16 3>
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[S]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p1 = getelementptr i16, ptr %p, i64 1
+ %p2 = getelementptr i16, ptr %p, i64 2
+ %p3 = getelementptr i16, ptr %p, i64 3
+
+ %l0 = load i16, ptr %p
+ %l1 = load i16, ptr %p1
+ %l2 = load i16, ptr %p2
+ %l3 = load i16, ptr %p3
+
+ %mul0 = add i16 %l0, 0
+ %mul1 = and i16 %l1, 5
+ %mul2 = and i16 %l2, 2
+ %mul3 = and i16 %l3, 3
+
+ %s1 = getelementptr i16, ptr %s, i64 1
+ %s2 = getelementptr i16, ptr %s, i64 2
+ %s3 = getelementptr i16, ptr %s, i64 3
+
+ store i16 %mul0, ptr %s
+ store i16 %mul1, ptr %s1
+ store i16 %mul2, ptr %s2
+ store i16 %mul3, ptr %s3
+ ret void
+}
More information about the llvm-commits
mailing list