[llvm] [PowerPC] Replace vspltisw+vadduwm instructions with xxleqv+vsubuwm for adding the vector {1, 1, 1, 1} (PR #160882)

Thu Nov 20 20:02:26 PST 2025

https://github.com/Himadhith updated https://github.com/llvm/llvm-project/pull/160882

>From cf815916f63d5db9b0431e5098c6dbaccc0e73af Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Fri, 26 Sep 2025 06:51:21 +0000
Subject: [PATCH 1/4] [PowerPC] Replace vspltisw instruction with xxleqv as
 generation of vector of -1s is cheaper than vector of 1s

---
 llvm/lib/Target/PowerPC/PPCInstrVSX.td | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 979ba31b0431b..fc00883528dc2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3613,6 +3613,10 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
                                immSExt5NonZero:$A, immSExt5NonZero:$A)),
           (v4i32 (VSPLTISW imm:$A))>;
 
+// Optimize for vector of 1s addition operation
+def : Pat<(add v4i32:$A, (build_vector (i32 1), (i32 1), (i32 1), (i32 1))),
+          (VSUBUWM $A, (v4i32 (COPY_TO_REGCLASS (XXLEQVOnes), VSRC)))>;
+
 // Splat loads.
 def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
           (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>;

>From 73bd0ed9097e210ab92fcbdbb94c665a931a2ec2 Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Mon, 13 Oct 2025 12:46:26 +0000
Subject: [PATCH 2/4] DAG combiner method as tablegen does not work with v2i64s

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 40 +++++++++++++++++++++
 llvm/lib/Target/PowerPC/PPCInstrVSX.td      |  4 ---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8bf0d118da575..2bcce6004f0e2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19208,6 +19208,44 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
   return MatPCRel;
 }
 
+static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
+                               const PPCSubtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  // Handle v2i64, v4i32, v8i16 and v16i8 types
+  if (!VT.isVector() || VT.getSizeInBits() != 128)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Check if RHS is BUILD_VECTOR
+  // To satisfy commutative property a+b = b+a
+  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+    std::swap(LHS, RHS);
+
+  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check if all the elements are 1
+  unsigned NumOfEles = RHS.getNumOperands();
+  for (unsigned i = 0; i < NumOfEles; ++i) {
+    auto *CN = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
+    if (!CN || CN->getSExtValue() != 1)
+      return SDValue();
+  }
+  SDLoc DL(N);
+
+  SDValue MinusOne = DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32);
+  SmallVector<SDValue, 4> Ops(4, MinusOne);
+  SDValue AllOnesVec = DAG.getBuildVector(MVT::v4i32, DL, Ops);
+
+  // Bitcast to the target vector type
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT, AllOnesVec);
+
+  return DAG.getNode(ISD::SUB, DL, VT, LHS, Bitcast);
+}
+
 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
     return Value;
@@ -19215,6 +19253,8 @@ SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
     return Value;
 
+  if (auto Value = combineADDToSUB(N, DCI.DAG, Subtarget))
+    return Value;
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index fc00883528dc2..979ba31b0431b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3613,10 +3613,6 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
                                immSExt5NonZero:$A, immSExt5NonZero:$A)),
           (v4i32 (VSPLTISW imm:$A))>;
 
-// Optimize for vector of 1s addition operation
-def : Pat<(add v4i32:$A, (build_vector (i32 1), (i32 1), (i32 1), (i32 1))),
-          (VSUBUWM $A, (v4i32 (COPY_TO_REGCLASS (XXLEQVOnes), VSRC)))>;
-
 // Splat loads.
 def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
           (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>;

>From d74869b2965328e696270c2ed3f55ebe29dcaaf1 Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Thu, 16 Oct 2025 13:49:47 +0000
Subject: [PATCH 3/4] update checks for affected files

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 ++-
 .../PowerPC/addition-vector-all-ones.ll       | 19 +++++++++----------
 .../CodeGen/PowerPC/vec_add_sub_doubleword.ll |  7 +++----
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2bcce6004f0e2..0102637945c98 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19213,7 +19213,8 @@ static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
 
   // Handle v2i64, v4i32, v8i16 and v16i8 types
-  if (!VT.isVector() || VT.getSizeInBits() != 128)
+  if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
+        VT == MVT::v2i64))
     return SDValue();
 
   SDValue LHS = N->getOperand(0);
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
index e67d031b1813f..4ec54fa8a0dee 100644
--- a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
+++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
@@ -8,15 +8,14 @@
 ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
 
-; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
+; Optimized version, which uses `xxleqv` to generate a vector of -1s and `vsubu*m` to subtract it, leveraging the identity A - (-1) = A + 1.
 
 ; Function for the vector type v2i64 `a + {1, 1}`
 define <2 x i64> @test_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: test_v2i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vupklsw v3, v3
-; CHECK-NEXT:    vaddudm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubudm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <2 x i64> %a, splat (i64 1)
@@ -27,8 +26,8 @@ entry:
 define <4 x i32> @test_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: test_v4i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vadduwm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubuwm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <4 x i32> %a, splat (i32 1)
@@ -39,8 +38,8 @@ entry:
 define <8 x i16> @test_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: test_v8i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltish v3, 1
-; CHECK-NEXT:    vadduhm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubuhm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <8 x i16> %a, splat (i16 1)
@@ -51,8 +50,8 @@ entry:
 define <16 x i8> @test_16i8(<16 x i8> %a) {
 ; CHECK-LABEL: test_16i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxspltib v3, 1
-; CHECK-NEXT:    vaddubm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsububm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <16 x i8> %a, splat (i8 1)
diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
index 210aee13486c3..033e0b76838df 100644
--- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
@@ -16,9 +16,8 @@ define <2 x i64> @test_add(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
 ; VSX-LABEL: increment_by_one:
 ; VSX:       # %bb.0:
-; VSX-NEXT:    vspltisw 3, 1
-; VSX-NEXT:    vupklsw 3, 3
-; VSX-NEXT:    vaddudm 2, 2, 3
+; VSX-NEXT:    xxleqv 35, 35, 35
+; VSX-NEXT:    vsubudm 2, 2, 3
 ; VSX-NEXT:    blr
 ;
 ; NOVSX-LABEL: increment_by_one:
@@ -26,7 +25,7 @@ define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
 ; NOVSX-NEXT:    addis 3, 2, .LCPI1_0@toc@ha
 ; NOVSX-NEXT:    addi 3, 3, .LCPI1_0@toc@l
 ; NOVSX-NEXT:    lvx 3, 0, 3
-; NOVSX-NEXT:    vaddudm 2, 2, 3
+; NOVSX-NEXT:    vsubudm 2, 2, 3
 ; NOVSX-NEXT:    blr
   %result = add <2 x i64> %x, <i64 1, i64 1>
   ret <2 x i64> %result

>From 432d6e01111e6e7e83bbad5fc8991bdbf3023673 Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Fri, 17 Oct 2025 05:45:57 +0000
Subject: [PATCH 4/4] addressing review comments

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp         | 12 ++++++++----
 llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 0102637945c98..125c96c3f8008 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19208,9 +19208,17 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
   return MatPCRel;
 }
 
+// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes))
+// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) efficiently via xxleqv
+// Mathematical identity: X + 1 = X - (-1)
+// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1
+// Requirement: VSX feature for efficient xxleqv generation
 static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
                                const PPCSubtarget &Subtarget) {
+
   EVT VT = N->getValueType(0);
+  if (!Subtarget.hasVSX())
+    return SDValue();
 
   // Handle v2i64, v4i32, v8i16 and v16i8 types
   if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
@@ -19221,10 +19229,6 @@ static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG,
   SDValue RHS = N->getOperand(1);
 
   // Check if RHS is BUILD_VECTOR
-  // To satisfy commutative property a+b = b+a
-  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
-    std::swap(LHS, RHS);
-
   if (RHS.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
index 033e0b76838df..d56b1be539b05 100644
--- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll
@@ -25,7 +25,7 @@ define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind {
 ; NOVSX-NEXT:    addis 3, 2, .LCPI1_0@toc@ha
 ; NOVSX-NEXT:    addi 3, 3, .LCPI1_0@toc@l
 ; NOVSX-NEXT:    lvx 3, 0, 3
-; NOVSX-NEXT:    vsubudm 2, 2, 3
+; NOVSX-NEXT:    vaddudm 2, 2, 3
 ; NOVSX-NEXT:    blr
   %result = add <2 x i64> %x, <i64 1, i64 1>
   ret <2 x i64> %result