[llvm] 2ed914c - [X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(),SRAI()) shuffle patterns.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 12 06:09:03 PST 2021
Author: Simon Pilgrim
Date: 2021-01-12T14:07:53Z
New Revision: 2ed914cb7e9c0737bdf60a0b1fd48b6499973325
URL: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325
DIFF: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325.diff
LOG: [X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(),SRAI()) shuffle patterns.
We can't easily treat ASHR as a faux shuffle, but if it was just feeding a PACKSS then it was likely being used as sign-extension for a truncation, so just peek through and adjust the mask accordingly.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/psubus.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 750c809eafca..f28e28689806 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7685,12 +7685,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// If we know input saturation won't happen (or we don't care for particular
// lanes), we can treat this as a truncation shuffle.
+ bool Offset0 = false, Offset1 = false;
if (Opcode == X86ISD::PACKSS) {
if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!(N1.isUndef() || EltsRHS.isNullValue()) &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
+ // We can't easily fold ASHR into a shuffle, but if it was feeding a
+ // PACKSS then it was likely being used for sign-extension for a
+ // truncation, so just peek through and adjust the mask accordingly.
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
+ N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset0 = true;
+ N0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
+ N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset1 = true;
+ N1 = N1.getOperand(0);
+ }
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
@@ -7707,6 +7721,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
+
+ if (Offset0 || Offset1) {
+ for (int &M : Mask)
+ if ((Offset0 && isInRange(M, 0, NumElts)) ||
+ (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
+ ++M;
+ }
return true;
}
case X86ISD::VTRUNC: {
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 06240cd8bad3..351629a732c1 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1403,11 +1403,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: packssdw %xmm6, %xmm5
; SSE2-NEXT: psubusw %xmm5, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: psubus_8i32_max:
@@ -1738,111 +1733,91 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: psubus_16i32_max:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: pcmpgtd %xmm8, %xmm6
; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: movdqa %xmm4, %xmm10
; SSE2-NEXT: pxor %xmm9, %xmm10
-; SSE2-NEXT: movdqa %xmm7, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pslld $16, %xmm5
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: packssdw %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm9, %xmm4
; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm2, %xmm9
; SSE2-NEXT: pcmpgtd %xmm9, %xmm7
; SSE2-NEXT: pxor %xmm7, %xmm8
-; SSE2-NEXT: pand %xmm4, %xmm7
+; SSE2-NEXT: pand %xmm2, %xmm7
; SSE2-NEXT: por %xmm8, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: packssdw %xmm6, %xmm7
-; SSE2-NEXT: psubusw %xmm7, %xmm1
-; SSE2-NEXT: psubusw %xmm3, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
+; SSE2-NEXT: psubusw %xmm7, %xmm0
+; SSE2-NEXT: psubusw %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: psubus_16i32_max:
; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSSE3-NEXT: movdqa %xmm5, %xmm8
; SSSE3-NEXT: pxor %xmm9, %xmm8
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
; SSSE3-NEXT: pslld $16, %xmm6
; SSSE3-NEXT: psrad $16, %xmm6
-; SSSE3-NEXT: movdqa %xmm2, %xmm10
+; SSSE3-NEXT: movdqa %xmm4, %xmm10
; SSSE3-NEXT: pxor %xmm9, %xmm10
-; SSSE3-NEXT: movdqa %xmm7, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pslld $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: packssdw %xmm6, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pslld $16, %xmm5
+; SSSE3-NEXT: psrad $16, %xmm5
+; SSSE3-NEXT: packssdw %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm9, %xmm4
; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: por %xmm3, %xmm6
; SSSE3-NEXT: pslld $16, %xmm6
; SSSE3-NEXT: psrad $16, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm9
+; SSSE3-NEXT: pxor %xmm2, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7
; SSSE3-NEXT: pxor %xmm7, %xmm8
-; SSSE3-NEXT: pand %xmm4, %xmm7
+; SSSE3-NEXT: pand %xmm2, %xmm7
; SSSE3-NEXT: por %xmm8, %xmm7
; SSSE3-NEXT: pslld $16, %xmm7
; SSSE3-NEXT: psrad $16, %xmm7
; SSSE3-NEXT: packssdw %xmm6, %xmm7
-; SSSE3-NEXT: psubusw %xmm7, %xmm1
-; SSSE3-NEXT: psubusw %xmm3, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: packssdw %xmm2, %xmm1
+; SSSE3-NEXT: psubusw %xmm7, %xmm0
+; SSSE3-NEXT: psubusw %xmm5, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: psubus_16i32_max:
@@ -1923,11 +1898,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: packssdw %xmm6, %xmm5
; SSE2-NEXT: psubusw %xmm5, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: psubus_i16_i32_max_swapped:
More information about the llvm-commits mailing list