[llvm] a3b57bc - [PowerPC] remove side effect for some cases of saturate instructions
Chen Zheng via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 13 18:38:10 PDT 2023
Author: Chen Zheng
Date: 2023-03-13T21:37:56-04:00
New Revision: a3b57bca97c0bf56ee1db319a5ff633b516f927e
URL: https://github.com/llvm/llvm-project/commit/a3b57bca97c0bf56ee1db319a5ff633b516f927e
DIFF: https://github.com/llvm/llvm-project/commit/a3b57bca97c0bf56ee1db319a5ff633b516f927e.diff
LOG: [PowerPC] remove side effect for some cases of saturate instructions
Fixes #60684
Reviewed By: nemanjai
Differential Revision: https://reviews.llvm.org/D145353
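For illustration, the user-visible effect can be sketched with the AltiVec
vec_sum4s builtin, which lowers to vsum4sbs/vsum4shs/vsum4ubs depending on the
element type. This is a minimal sketch, not part of this commit; the function
name is made up for the example:

  // sum_sketch.cpp -- illustrative only; build with a PowerPC compiler,
  // e.g. clang++ -target powerpc64le-unknown-linux-gnu -maltivec -O2 -S
  #include <altivec.h>

  void discard(__vector signed char v) {
    // The accumulator is a zero splat, so vsum4sbs cannot set the SAT bit,
    // and the result is unused. Before this patch the vsum4sbs (and the
    // xxlxor that materializes the zero) survived because of the modeled
    // side effect; with this combine the chain is bypassed and both become
    // dead code.
    (void)vec_sum4s(v, vec_splats(0));
  }

The test1 through test3 functions in the diff below exercise exactly this
pattern at the IR level.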
Added:
Modified:
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cb3b31e80ced3..4011aaff50a0a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15787,16 +15787,37 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   case ISD::INTRINSIC_W_CHAIN:
-    // For little endian, VSX loads require generating lxvd2x/xxswapd.
-    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
-    if (Subtarget.needsSwapsForVSXMemOps()) {
-      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
-      default:
-        break;
-      case Intrinsic::ppc_vsx_lxvw4x:
-      case Intrinsic::ppc_vsx_lxvd2x:
-        return expandVSXLoadForLE(N, DCI);
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+    case Intrinsic::ppc_altivec_vsum4sbs:
+    case Intrinsic::ppc_altivec_vsum4shs:
+    case Intrinsic::ppc_altivec_vsum4ubs: {
+      // These sum-across intrinsics only have a chain due to the side effect
+      // that they may set the SAT bit. If we know the SAT bit will not be set
+      // for some inputs, we can replace any uses of their chain with the input
+      // chain.
+      if (BuildVectorSDNode *BVN =
+              dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
+        APInt APSplatBits, APSplatUndef;
+        unsigned SplatBitSize;
+        bool HasAnyUndefs;
+        bool BVNIsConstantSplat = BVN->isConstantSplat(
+            APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
+            !Subtarget.isLittleEndian());
+        // If the constant splat vector is 0, the SAT bit will not be set.
+        if (BVNIsConstantSplat && APSplatBits == 0)
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
       }
+      return SDValue();
+    }
+    case Intrinsic::ppc_vsx_lxvw4x:
+    case Intrinsic::ppc_vsx_lxvd2x:
+      // For little endian, VSX loads require generating lxvd2x/xxswapd.
+      // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+      if (Subtarget.needsSwapsForVSXMemOps())
+        return expandVSXLoadForLE(N, DCI);
+      break;
     }
     break;
   case ISD::INTRINSIC_VOID:
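The arithmetic behind the new comment: with a zero accumulator, each word lane
of vsum4sbs sums four signed bytes, vsum4shs sums two signed halfwords, and
vsum4ubs sums four unsigned bytes, so every lane stays well inside the 32-bit
saturation bounds. A standalone sanity check of that claim, assuming those ISA
semantics; this is not LLVM code:

  // sat_check.cpp -- verifies the zero-accumulator no-saturation claim.
  #include <cstdint>

  // vsum4sbs: four signed bytes per word lane; range [-512, 508].
  static_assert(4LL * INT8_MIN > INT32_MIN && 4LL * INT8_MAX < INT32_MAX,
                "vsum4sbs cannot saturate with a zero accumulator");
  // vsum4shs: two signed halfwords per word lane; range [-65536, 65534].
  static_assert(2LL * INT16_MIN > INT32_MIN && 2LL * INT16_MAX < INT32_MAX,
                "vsum4shs cannot saturate with a zero accumulator");
  // vsum4ubs: four unsigned bytes per word lane; at most 1020.
  static_assert(4ULL * UINT8_MAX < UINT32_MAX,
                "vsum4ubs cannot saturate with a zero accumulator");

  int main() { return 0; }

This is also why the combine only fires for a constant zero splat: with a
general accumulator the sums can cross the bounds and legitimately set SAT.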
diff --git a/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll b/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
index 12f4a71afeef9..e371d7c70f3e3 100644
--- a/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
@@ -9,8 +9,6 @@
 define void @test1(<16 x i8> %0) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4sbs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -20,8 +18,6 @@ entry:
 define void @test2(<8 x i16> %0) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4shs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4shs(<8 x i16> %0, <4 x i32> zeroinitializer)
@@ -31,8 +27,6 @@ entry:
 define void @test3(<16 x i8> %0) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4ubs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4ubs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -108,9 +102,8 @@ entry:
 define <4 x i32> @test10(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4sbs v2, v2, v4
-; CHECK-NEXT:    vsum4sbs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4sbs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -121,9 +114,8 @@ entry:
 define <4 x i32> @test11(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-LABEL: test11:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4shs v2, v2, v4
-; CHECK-NEXT:    vsum4shs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4shs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4shs(<8 x i16> %0, <4 x i32> zeroinitializer)
@@ -134,9 +126,8 @@ entry:
 define <4 x i32> @test12(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4ubs v2, v2, v4
-; CHECK-NEXT:    vsum4ubs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4ubs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4ubs(<16 x i8> %0, <4 x i32> zeroinitializer)