[llvm] [PowerPC] Replace vspltisw+vadduwm instructions with xxleqv+vsubuwm for adding the vector {1, 1, 1, 1} (PR #160882)

Fri Sep 26 22:49:56 PDT 2025

https://github.com/Himadhith updated https://github.com/llvm/llvm-project/pull/160882

>From 5de66e27a5d4c3b99491db23c9dde072a6a79158 Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Fri, 26 Sep 2025 06:51:21 +0000
Subject: [PATCH 1/2] [PowerPC] Replace vspltisw instruction with xxleqv as
 generation of vector of -1s is cheaper than vector of 1s

---
 llvm/lib/Target/PowerPC/PPCInstrVSX.td       |  4 +++
 llvm/test/CodeGen/PowerPC/vector-all-ones.ll | 32 ++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/vector-all-ones.ll

diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 4e5165bfcda55..a2fc7a0247e2b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3627,6 +3627,10 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
                                immSExt5NonZero:$A, immSExt5NonZero:$A)),
           (v4i32 (VSPLTISW imm:$A))>;
 
+// Lower v4i32 add of splat(1) as a subtract of all-ones (-1), built by xxleqv.
+def : Pat<(add v4i32:$A, (build_vector (i32 1), (i32 1), (i32 1), (i32 1))),
+          (VSUBUWM $A, (v4i32 (COPY_TO_REGCLASS (XXLEQVOnes), VSRC)))>;
+
 // Splat loads.
 def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
           (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>;
diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
new file mode 100644
index 0000000000000..01025834ea612
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32
+
+define dso_local noundef <4 x i32> @test1(<4 x i32> %a) {
+; POWERPC_64LE-LABEL: test1:
+; POWERPC_64LE:       # %bb.0: # %entry
+; POWERPC_64LE-NEXT:    xxleqv v3, v3, v3
+; POWERPC_64LE-NEXT:    vsubuwm v2, v2, v3
+; POWERPC_64LE-NEXT:    blr
+;
+; POWERPC_64-LABEL: test1:
+; POWERPC_64:       # %bb.0: # %entry
+; POWERPC_64-NEXT:    xxleqv v3, v3, v3
+; POWERPC_64-NEXT:    vsubuwm v2, v2, v3
+; POWERPC_64-NEXT:    blr
+;
+; POWERPC_32-LABEL: test1:
+; POWERPC_32:       # %bb.0: # %entry
+; POWERPC_32-NEXT:    xxleqv v3, v3, v3
+; POWERPC_32-NEXT:    vsubuwm v2, v2, v3
+; POWERPC_32-NEXT:    blr
+entry:
+  %add = add <4 x i32> %a, splat (i32 1)
+  ret <4 x i32> %add
+}

>From 9d18c9fd6ceaa27f0f7d812d036e1aa8f0251354 Mon Sep 17 00:00:00 2001
From: himadhith <himadhith.v at ibm.com>
Date: Sat, 27 Sep 2025 05:56:03 +0000
Subject: [PATCH 2/2] Updating testfile

---
 llvm/test/CodeGen/PowerPC/vector-all-ones.ll | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
index e4c93adcf50a6..49c46d8eff726 100644
--- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
@@ -8,14 +8,12 @@
 ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
 ; RUN:     -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
 
-; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation.
-; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s
-; followed by subtraction operation.
+; Vector addition with {1,1,1,1} is optimized by replacing `vspltisw + vadduwm` with `xxleqv + vsubuwm`.
 define dso_local noundef <4 x i32> @test1(<4 x i32> %a) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vspltisw v3, 1
-; CHECK-NEXT:    vadduwm v2, v2, v3
+; CHECK-NEXT:    xxleqv v3, v3, v3
+; CHECK-NEXT:    vsubuwm v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %add = add <4 x i32> %a, splat (i32 1)