[llvm] [VectorCombine] Optimize vector combine in fold binop of reduction (PR #179416)

Sun Feb 8 23:38:25 PST 2026

https://github.com/Anjian-Wen updated https://github.com/llvm/llvm-project/pull/179416

>From 8b345aabfb01136b19ca093dd2ca8f5c463660f2 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Tue, 3 Feb 2026 16:19:04 +0800
Subject: [PATCH 1/2] [RISCV] optimize vector combine in fold binop of
 reduction

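Reassociate a scalar add/sub chain so that two vector.reduce.add results become
the operands of the same binary operator, using the commutativity and
associativity of integer addition. For example,
  sub (add (a, reduce_add b), reduce_add c)
becomes
  add (a, sub (reduce_add b, reduce_add c)).
This paves the way for the two adjacent reductions to be folded into a single
reduction of a vector binop in some cases, saving one reduction instruction.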
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 34 +++++++++++++++++++
 .../VectorCombine/fold-binop-of-reductions.ll | 30 ++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1746d3e4b06f4..1dca47e97988d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1754,6 +1754,40 @@ bool VectorCombine::foldBinopOfReductions(Instruction &I) {
     return nullptr;
   };
 
+  // sub (add (a, vector_reduce_add b), vector_reduce_add c) ->
+  // add (a, sub (vector_reduce_add b, vector_reduce_add c))
+  // sub (add (vector_reduce_add b, a), vector_reduce_add c) ->
+  // add (a, sub (vector_reduce_add b, vector_reduce_add c))
+  if (BinOpOpc == Instruction::Sub) {
+    auto *II = dyn_cast<BinaryOperator>(I.getOperand(0));
+    if (II && II->getOpcode() == Instruction::Add) {
+      Value *V1 =
+          checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
+      if (V1) {
+        Instruction *I0 = dyn_cast<Instruction>(I.getOperand(0));
+        Value *V00 =
+            checkIntrinsicAndGetItsArgument(I0->getOperand(0), ReductionIID);
+        Value *V01 =
+            checkIntrinsicAndGetItsArgument(I0->getOperand(1), ReductionIID);
+        if (V00) {
+          Value *NewSub =
+              Builder.CreateBinOp(BinOpOpc, I0->getOperand(0), I.getOperand(1));
+          Value *NewAdd =
+              Builder.CreateBinOp(Instruction::Add, I0->getOperand(1), NewSub);
+          replaceValue(I, *NewAdd);
+          return true;
+        } else if (V01) {
+          Value *NewSub =
+              Builder.CreateBinOp(BinOpOpc, I0->getOperand(1), I.getOperand(1));
+          Value *NewAdd =
+              Builder.CreateBinOp(Instruction::Add, I0->getOperand(0), NewSub);
+          replaceValue(I, *NewAdd);
+          return true;
+        }
+      }
+    }
+  }
+
   Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
   if (!V0)
     return false;
diff --git a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
index 5f29af9de5a39..002f98990e197 100644
--- a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -205,3 +205,33 @@ define i32 @element_counts_do_not_match_vscale(<vscale x 16 x i32> %v0, <vscale
   %res = add i32 %v0_red, %v1_red
   ret i32 %res
 }
+
+define i32 @sub_add_reduction_s_reduction(<vscale x 8 x i32> %v0, <vscale x 8 x i32> %v1, i32 %s1) {
+; CHECK-LABEL: define i32 @sub_add_reduction_s_reduction(
+; CHECK-SAME: <vscale x 8 x i32> [[V0:%.*]], <vscale x 8 x i32> [[V1:%.*]], i32 [[S1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <vscale x 8 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP2]], [[S1]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v1)
+  %add1 = add i32 %v0_red, %s1
+  %res = sub i32 %add1, %v1_red
+  ret i32 %res
+}
+
+define i32 @sub_add_s_reduction_reduction(<vscale x 8 x i32> %v0, <vscale x 8 x i32> %v1, i32 %s1) {
+; CHECK-LABEL: define i32 @sub_add_s_reduction_reduction(
+; CHECK-SAME: <vscale x 8 x i32> [[V0:%.*]], <vscale x 8 x i32> [[V1:%.*]], i32 [[S1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <vscale x 8 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[S1]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v1)
+  %add1 = add i32 %s1, %v0_red
+  %res = sub i32 %add1, %v1_red
+  ret i32 %res
+}

>From 0913941580820fe63b9f8870c7ce8e4b6b3dd369 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 15:36:57 +0800
Subject: [PATCH 2/2] add more patterns and more tests

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 63 ++++++++++++-------
 .../VectorCombine/fold-binop-of-reductions.ll | 30 +++++++++
 2 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1dca47e97988d..74bf337c1767a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1754,34 +1754,51 @@ bool VectorCombine::foldBinopOfReductions(Instruction &I) {
     return nullptr;
   };
 
-  // sub (add (a, vector_reduce_add b), vector_reduce_add c) ->
-  // add (a, sub (vector_reduce_add b, vector_reduce_add c))
   // sub (add (vector_reduce_add b, a), vector_reduce_add c) ->
+  // add (sub (vector_reduce_add b, vector_reduce_add c), a)
+  //
+  // sub (sub (vector_reduce_add b, a), vector_reduce_add c) ->
+  // sub (sub (vector_reduce_add b, vector_reduce_add c), a)
+  //
+  // sub (sub (a, vector_reduce_add b), vector_reduce_add c) ->
+  // sub (a, add (vector_reduce_add b, vector_reduce_add c))
+  //
+  // sub (add (a, vector_reduce_add b), vector_reduce_add c) ->
   // add (a, sub (vector_reduce_add b, vector_reduce_add c))
   if (BinOpOpc == Instruction::Sub) {
     auto *II = dyn_cast<BinaryOperator>(I.getOperand(0));
-    if (II && II->getOpcode() == Instruction::Add) {
-      Value *V1 =
-          checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
-      if (V1) {
-        Instruction *I0 = dyn_cast<Instruction>(I.getOperand(0));
-        Value *V00 =
-            checkIntrinsicAndGetItsArgument(I0->getOperand(0), ReductionIID);
-        Value *V01 =
-            checkIntrinsicAndGetItsArgument(I0->getOperand(1), ReductionIID);
-        if (V00) {
-          Value *NewSub =
-              Builder.CreateBinOp(BinOpOpc, I0->getOperand(0), I.getOperand(1));
-          Value *NewAdd =
-              Builder.CreateBinOp(Instruction::Add, I0->getOperand(1), NewSub);
-          replaceValue(I, *NewAdd);
+    Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
+
+    if (II && V1 &&
+        (II->getOpcode() == Instruction::Add ||
+         II->getOpcode() == Instruction::Sub)) {
+      Instruction *I0 = dyn_cast<Instruction>(I.getOperand(0));
+      Value *V00 =
+          checkIntrinsicAndGetItsArgument(I0->getOperand(0), ReductionIID);
+      Value *V01 =
+          checkIntrinsicAndGetItsArgument(I0->getOperand(1), ReductionIID);
+
+      if (V00 && !V01) {
+        Value *CombineNode = Builder.CreateBinOp(
+            Instruction::Sub, I0->getOperand(0), I.getOperand(1));
+        Value *NewBinNode = Builder.CreateBinOp(II->getOpcode(), CombineNode,
+                                                I0->getOperand(1));
+        replaceValue(I, *NewBinNode);
+        return true;
+      } else if (V01 && !V00) {
+        if (II->getOpcode() == Instruction::Sub) {
+          Value *CombineNode = Builder.CreateBinOp(
+              Instruction::Add, I0->getOperand(1), I.getOperand(1));
+          Value *NewBinNode = Builder.CreateBinOp(
+              Instruction::Sub, I0->getOperand(0), CombineNode);
+          replaceValue(I, *NewBinNode);
           return true;
-        } else if (V01) {
-          Value *NewSub =
-              Builder.CreateBinOp(BinOpOpc, I0->getOperand(1), I.getOperand(1));
-          Value *NewAdd =
-              Builder.CreateBinOp(Instruction::Add, I0->getOperand(0), NewSub);
-          replaceValue(I, *NewAdd);
+        } else if (II->getOpcode() == Instruction::Add) {
+          Value *CombineNode = Builder.CreateBinOp(
+              Instruction::Sub, I0->getOperand(1), I.getOperand(1));
+          Value *NewBinNode = Builder.CreateBinOp(
+              Instruction::Add, I0->getOperand(0), CombineNode);
+          replaceValue(I, *NewBinNode);
           return true;
         }
       }
diff --git a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
index 002f98990e197..22960119ce056 100644
--- a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -235,3 +235,33 @@ define i32 @sub_add_s_reduction_reduction(<vscale x 8 x i32> %v0, <vscale x 8 x
   %res = sub i32 %add1, %v1_red
   ret i32 %res
 }
+
+define i32 @sub_sub_reduction_s_reduction(<vscale x 8 x i32> %v0, <vscale x 8 x i32> %v1, i32 %s1) {
+; CHECK-LABEL: define i32 @sub_sub_reduction_s_reduction(
+; CHECK-SAME: <vscale x 8 x i32> [[V0:%.*]], <vscale x 8 x i32> [[V1:%.*]], i32 [[S1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <vscale x 8 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = sub i32 [[TMP2]], [[S1]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v1)
+  %sub1 = sub i32 %v0_red, %s1
+  %res = sub i32 %sub1, %v1_red
+  ret i32 %res
+}
+
+define i32 @sub_sub_s_reduction_reduction(<vscale x 8 x i32> %v0, <vscale x 8 x i32> %v1, i32 %s1) {
+; CHECK-LABEL: define i32 @sub_sub_s_reduction_reduction(
+; CHECK-SAME: <vscale x 8 x i32> [[V0:%.*]], <vscale x 8 x i32> [[V1:%.*]], i32 [[S1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add <vscale x 8 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[RES:%.*]] = sub i32 [[S1]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<vscale x 8 x i32> %v1)
+  %sub1 = sub i32 %s1, %v0_red
+  %res = sub i32 %sub1, %v1_red
+  ret i32 %res
+}