[llvm] b0a77af - [DAG] SimplifyDemandedBits - add sra(shl(x,c1),c1) -> sign_extend_inreg(x) demanded elts fold
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 15 08:32:18 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-15T16:32:03+01:00
New Revision: b0a77af4f19a4f6b49ac4aadd0a9c89d287b74ce
URL: https://github.com/llvm/llvm-project/commit/b0a77af4f19a4f6b49ac4aadd0a9c89d287b74ce
DIFF: https://github.com/llvm/llvm-project/commit/b0a77af4f19a4f6b49ac4aadd0a9c89d287b74ce.diff
LOG: [DAG] SimplifyDemandedBits - add sra(shl(x,c1),c1) -> sign_extend_inreg(x) demanded elts fold
Move the sra(shl(x,c1),c1) -> sign_extend_inreg(x) fold inside SimplifyDemandedBits so that we can recognize hidden splat shift amounts through the DemandedElts mask.
Because the c1 shift amount has multiple uses, a hidden splat won't be simplified to a splat constant build vector, so the existing fold in DAGCombiner::visitSRA can't fire: it never sees a uniform shift amount.
I also needed to add a TLI preferSextInRegOfTruncate hook so that X86 keeps truncate(sign_extend_inreg(x)) patterns in vector form, where they can be lowered more efficiently with PACKSS.
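As a minimal sketch of the kind of pattern this now catches (a hypothetical function modelled on the trunc_ashr_v4i64_demandedelts test below; the function name and shuffle indices are illustrative, not taken from the commit): the <63,0,63,0> shift amount is not a splat build vector, but once only the lanes shifted by 63 are demanded it behaves as a uniform shift of 63, and the shl+ashr pair simplifies to a sign extension of the low bit.

  define <8 x i16> @hidden_splat_sext_inreg(<4 x i64> %a0) {
    ; Non-uniform shift amounts: only the even i64 lanes are shifted by 63.
    %shl = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
    %sra = ashr <4 x i64> %shl, <i64 63, i64 0, i64 63, i64 0>
    %bc  = bitcast <4 x i64> %sra to <8 x i32>
    ; Only i32 elements coming from i64 lanes 0 and 2 are demanded, so the
    ; DemandedElts mask hides the zero shift amounts in the odd lanes.
    %shuf = shufflevector <8 x i32> %bc, <8 x i32> poison,
                          <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 5, i32 5, i32 5, i32 5>
    %tr = trunc <8 x i32> %shuf to <8 x i16>
    ret <8 x i16> %tr
  }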
Differential Revision: https://reviews.llvm.org/D157972
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/test/CodeGen/X86/packss.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 6daf623665dafe..ce12abe952c9a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -824,6 +824,14 @@ class TargetLoweringBase {
// Return true if the target wants to transform Op(Splat(X)) -> Splat(Op(X))
virtual bool preferScalarizeSplat(SDNode *N) const { return true; }
+ // Return true if the target wants to transform:
+  // (TruncVT truncate(sext_in_reg(VT X, ExtVT)))
+ // -> (TruncVT sext_in_reg(truncate(VT X), ExtVT))
+ // Some targets might prefer pre-sextinreg to improve truncation/saturation.
+ virtual bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const {
+ return true;
+ }
+
/// Return true if the target wants to use the optimization that
/// turns ext(promotableInst1(...(promotableInstN(load)))) into
/// promotedInst1(...(promotedInstN(ext(load)))).
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7ebfecab33bc08..c2206b77d8ad82 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10134,25 +10134,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
- // sext_inreg.
ConstantSDNode *N1C = isConstOrConstSplat(N1);
- if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
- unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
- EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
- if (VT.isVector())
- ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
- VT.getVectorElementCount());
- if (!LegalOperations ||
- TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
- TargetLowering::Legal)
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
- N0.getOperand(0), DAG.getValueType(ExtVT));
- // Even if we can't convert to sext_inreg, we might be able to remove
- // this shift pair if the input is already sign extended.
- if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
- return N0.getOperand(0);
- }
// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
// clamp (add c1, c2) to max shift.
@@ -14374,7 +14356,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue X = N0.getOperand(0);
SDValue ExtVal = N0.getOperand(1);
EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
- if (ExtVT.bitsLT(VT)) {
+ if (ExtVT.bitsLT(VT) && TLI.preferSextInRegOfTruncate(VT, SrcVT, ExtVT)) {
SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c01189c76e08e6..2b44b57ab2a32c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1945,6 +1945,35 @@ bool TargetLowering::SimplifyDemandedBits(
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
+ // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target
+ // supports sext_inreg.
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (const APInt *InnerSA =
+ TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+ unsigned LowBits = BitWidth - ShAmt;
+ EVT ExtVT = EVT::getIntegerVT(*TLO.DAG.getContext(), LowBits);
+ if (VT.isVector())
+ ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtVT,
+ VT.getVectorElementCount());
+
+ if (*InnerSA == ShAmt) {
+ if (!TLO.LegalOperations() ||
+ getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) == Legal)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT,
+ Op0.getOperand(0),
+ TLO.DAG.getValueType(ExtVT)));
+
+ // Even if we can't convert to sext_inreg, we might be able to
+ // remove this shift pair if the input is already sign extended.
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0.getOperand(0), DemandedElts);
+ if (NumSignBits > ShAmt)
+ return TLO.CombineTo(Op, Op0.getOperand(0));
+ }
+ }
+ }
+
APInt InDemandedMask = (DemandedBits << ShAmt);
// If the shift is exact, then it does demand the low bits (and knows that
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d65d15aac3bbbc..559500df90242e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55982,6 +55982,12 @@ bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
return false;
}
+// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
+bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
+ EVT ExtVT) const {
+ return Subtarget.hasAVX512() || !VT.isVector();
+}
+
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 77b2c081e2097e..b3b00dbf3c07da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1054,6 +1054,9 @@ namespace llvm {
bool preferABDSToABSWithNSW(EVT VT) const override;
+ bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
+ EVT ExtVT) const override;
+
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index ec14c95f0fc625..35919f65d3de0a 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX,X86-AVX1
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX,X86-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX,X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX
define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: trunc_ashr_v4i64:
@@ -175,107 +175,55 @@ define <8 x i16> @trunc_ashr_v4i32_icmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwi
}
define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
-; X86-SSE-LABEL: trunc_ashr_v4i64_demandedelts:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: psllq $63, %xmm1
-; X86-SSE-NEXT: psllq $63, %xmm0
-; X86-SSE-NEXT: psrlq $63, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,2147483648]
-; X86-SSE-NEXT: pxor %xmm2, %xmm0
-; X86-SSE-NEXT: psubq %xmm2, %xmm0
-; X86-SSE-NEXT: psrlq $63, %xmm1
-; X86-SSE-NEXT: pxor %xmm2, %xmm1
-; X86-SSE-NEXT: psubq %xmm2, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X86-SSE-NEXT: packssdw %xmm1, %xmm0
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,0,1,0]
-; X86-AVX1-NEXT: # xmm2 = mem[0,0]
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vzeroupper
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: trunc_ashr_v4i64_demandedelts:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
-; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
-; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,0,2147483648,1,0,0,2147483648]
-; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
-; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
+; SSE2-LABEL: trunc_ashr_v4i64_demandedelts:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psllq $63, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: ret{{[l|q]}}
;
-; X64-SSE-LABEL: trunc_ashr_v4i64_demandedelts:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: psllq $63, %xmm1
-; X64-SSE-NEXT: psllq $63, %xmm0
-; X64-SSE-NEXT: psrlq $63, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,9223372036854775808]
-; X64-SSE-NEXT: pxor %xmm2, %xmm0
-; X64-SSE-NEXT: psubq %xmm2, %xmm0
-; X64-SSE-NEXT: psrlq $63, %xmm1
-; X64-SSE-NEXT: pxor %xmm2, %xmm1
-; X64-SSE-NEXT: psubq %xmm2, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-SSE-NEXT: packssdw %xmm1, %xmm0
-; X64-SSE-NEXT: retq
+; SSE4-LABEL: trunc_ashr_v4i64_demandedelts:
+; SSE4: # %bb.0:
+; SSE4-NEXT: psllq $63, %xmm0
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: pcmpgtq %xmm0, %xmm3
+; SSE4-NEXT: psllq $63, %xmm1
+; SSE4-NEXT: pcmpgtq %xmm1, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
+; SSE4-NEXT: packssdw %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
;
-; X64-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,9223372036854775808]
-; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
+; AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: ret{{[l|q]}}
;
-; X64-AVX2-LABEL: trunc_ashr_v4i64_demandedelts:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808]
-; X64-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
-; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; AVX2-LABEL: trunc_ashr_v4i64_demandedelts:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: ret{{[l|q]}}
%1 = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
%2 = ashr <4 x i64> %1, <i64 63, i64 0, i64 63, i64 0>
%3 = bitcast <4 x i64> %2 to <8 x i32>