[llvm] c3e7a1a - [NFC][PowerPC] Optimize vector compares for not equal to non zero vectors (#171635)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 21:53:19 PST 2025
Author: Himadhith
Date: 2025-12-12T11:23:14+05:30
New Revision: c3e7a1ab8f60c797e320687a92a7a59af62dc242
URL: https://github.com/llvm/llvm-project/commit/c3e7a1ab8f60c797e320687a92a7a59af62dc242
DIFF: https://github.com/llvm/llvm-project/commit/c3e7a1ab8f60c797e320687a92a7a59af62dc242.diff
LOG: [NFC][PowerPC] Optimize vector compares for not equal to non zero vectors (#171635)
Lockdown instructions for vector compares `not equal to non-zero (Ex:
vec[i]!=7)`. Current implementation can be made better by removing the
negation and using the identity `0XFFFF + 1 = 0` and `0 + 1 = 1`.
Co-authored-by: himadhith <himadhith.v at ibm.com>
Added:
llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll b/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll
new file mode 100644
index 0000000000000..bfb9ab3356f48
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32
+
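+; The current implementation compares the input vector in v2 against a zero vector in v3 (vcmpequh); the result in v2 is then negated (xxlnor) and masked with a splat of 1 (xxland), which converts each lane:
+; 0XFFFF (lane equal to zero) -> 0
+; 0 (lane not equal to zero) -> 1
+; This NFC patch only locks down the current codegen; an optimized version is to follow in a later patch.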
+
+define i32 @cols_needed(<4 x i16> %wide.load) {
+; POWERPC_64LE-LABEL: cols_needed:
+; POWERPC_64LE: # %bb.0: # %entry
+; POWERPC_64LE-NEXT: xxlxor v3, v3, v3
+; POWERPC_64LE-NEXT: li r3, 0
+; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
+; POWERPC_64LE-NEXT: vspltisw v3, 1
+; POWERPC_64LE-NEXT: xxlnor v2, v2, v2
+; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
+; POWERPC_64LE-NEXT: xxland v2, v2, v3
+; POWERPC_64LE-NEXT: xxswapd v3, v2
+; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
+; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
+; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
+; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
+; POWERPC_64LE-NEXT: blr
+;
+; POWERPC_64-LABEL: cols_needed:
+; POWERPC_64: # %bb.0: # %entry
+; POWERPC_64-NEXT: xxlxor v3, v3, v3
+; POWERPC_64-NEXT: li r3, 0
+; POWERPC_64-NEXT: vcmpequh v2, v2, v3
+; POWERPC_64-NEXT: vspltisw v3, 1
+; POWERPC_64-NEXT: xxlnor v2, v2, v2
+; POWERPC_64-NEXT: vmrghh v2, v2, v2
+; POWERPC_64-NEXT: xxland v2, v2, v3
+; POWERPC_64-NEXT: xxswapd v3, v2
+; POWERPC_64-NEXT: vadduwm v2, v2, v3
+; POWERPC_64-NEXT: xxspltw v3, v2, 1
+; POWERPC_64-NEXT: vadduwm v2, v2, v3
+; POWERPC_64-NEXT: vextuwlx r3, r3, v2
+; POWERPC_64-NEXT: blr
+;
+; POWERPC_32-LABEL: cols_needed:
+; POWERPC_32: # %bb.0: # %entry
+; POWERPC_32-NEXT: xxlxor v3, v3, v3
+; POWERPC_32-NEXT: vcmpequh v2, v2, v3
+; POWERPC_32-NEXT: vspltisw v3, 1
+; POWERPC_32-NEXT: xxlnor v2, v2, v2
+; POWERPC_32-NEXT: vmrghh v2, v2, v2
+; POWERPC_32-NEXT: xxland v2, v2, v3
+; POWERPC_32-NEXT: xxswapd v3, v2
+; POWERPC_32-NEXT: vadduwm v2, v2, v3
+; POWERPC_32-NEXT: xxspltw v3, v2, 1
+; POWERPC_32-NEXT: vadduwm v2, v2, v3
+; POWERPC_32-NEXT: stxv v2, -16(r1)
+; POWERPC_32-NEXT: lwz r3, -16(r1)
+; POWERPC_32-NEXT: blr
+entry:
+ %0 = icmp ne <4 x i16> %wide.load, zeroinitializer ; per-lane test: true where the i16 lane is not zero
+ %1 = zext <4 x i1> %0 to <4 x i32> ; widen each i1 to i32: true -> 1, false -> 0
+ %2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) ; sum the four lanes = number of non-zero input lanes
+ ret i32 %2 ; return that count
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
More information about the llvm-commits
mailing list