[llvm] 451af63 - [X86] Remove combineVectorTruncation and delay general vector trunc to lowering
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 13 03:29:51 PDT 2023
Author: Simon Pilgrim
Date: 2023-07-13T11:29:21+01:00
New Revision: 451af635519113bc8fe94852d7489f26485f6689
URL: https://github.com/llvm/llvm-project/commit/451af635519113bc8fe94852d7489f26485f6689
DIFF: https://github.com/llvm/llvm-project/commit/451af635519113bc8fe94852d7489f26485f6689.diff
LOG: [X86] Remove combineVectorTruncation and delay general vector trunc to lowering
Stop folding vector truncations to PACKSS/PACKUS patterns prematurely - another step towards Issue #63710. We still prematurely fold to PACKSS/PACKUS if there are sufficient sign bits; that will be addressed in a later patch when we remove combineVectorSignBitsTruncation.
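To illustrate the shape of node affected (a sketch, not one of the updated tests), a plain truncation with no known sign/zero bits such as:

  ; Previously combineVectorTruncation rewrote this to X86ISD::PACKUS/PACKSS
  ; during DAG combine; it now stays an ISD::TRUNCATE until LowerTRUNCATE.
  define <16 x i8> @trunc_v16i16_v16i8(<16 x i16> %a) {
    %t = trunc <16 x i16> %a to <16 x i8>
    ret <16 x i8> %t
  }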
This required extending ReplaceNodeResults' handling of sub-128-bit results to SSSE3 (or later) targets, which has allowed us to improve vXi32->vXi16 truncations to use PSHUFB.
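The vXi32->vXi16 improvement is visible in the trunc16i32_16i16 / trunc_usat_v16i32_v16i16 diffs below; as a sketch of the input (hypothetical function name):

  ; On SSSE3 each v4i32 half is now narrowed with a pshufb byte shuffle
  ; (mask [0,1,4,5,8,9,12,13,...]) and merged with punpcklqdq, replacing
  ; the longer pslld/psrad/packssdw sequence.
  define <16 x i16> @trunc_v16i32_v16i16(<16 x i32> %a) {
    %t = trunc <16 x i32> %a to <16 x i16>
    ret <16 x i16> %t
  }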
I also tweaked LowerTruncateVecPack to recognise widened truncation source operands so that the upper elements remain UNDEF (otherwise truncateVectorWithPACK* will constant-fold them to allzeros/allones values).
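For the widened-source case (a sketch; compare the v4i64 -> v4i16 special case in the ReplaceNodeResults hunk below):

  ; v4i64 -> v4i16 gets widened to a v8i64 -> v8i16 truncate whose upper
  ; source half is undef. LowerTruncateVecPack now splits off the undef
  ; half, truncates only the low v4i64 and widens the result back, so
  ; truncateVectorWithPACK* never constant-folds the undef lanes.
  define <4 x i16> @trunc_v4i64_v4i16(<4 x i64> %a) {
    %t = trunc <4 x i64> %a to <4 x i16>
    ret <4 x i16> %t
  }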
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e4d74eccd2c3dc..8e79f5ea86a110 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22933,6 +22933,27 @@ static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
return SDValue();
}
+  // If the upper half of the source is undef, then attempt to split and
+  // only truncate the lower half.
+  if (DstVT.getSizeInBits() >= 128) {
+    SmallVector<SDValue> SubOps;
+    if (collectConcatOps(In.getNode(), SubOps, DAG)) {
+      ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.end());
+      ArrayRef<SDValue> UpperOps(SubOps.begin(), SubOps.end());
+      LowerOps = LowerOps.drop_back(SubOps.size() / 2);
+      UpperOps = UpperOps.drop_front(SubOps.size() / 2);
+      if (all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
+        MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
+        MVT SrcHalfVT = SrcVT.getHalfNumVectorElementsVT();
+        SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcHalfVT, LowerOps);
+        if (SDValue Res =
+                LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
+          return widenSubVector(Res, false, Subtarget, DAG, DL,
+                                DstVT.getSizeInBits());
+      }
+    }
+  }
+
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
@@ -34615,12 +34636,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
-    // Pre-SSSE3 (or v4i64 -> v4i16) widen the truncation input vector to let
-    // LowerTRUNCATE handle this via type legalization.
+    // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
+    // this via type legalization.
     if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
         (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
-        (!Subtarget.hasSSSE3() || (InVT == MVT::v4i64 && VT == MVT::v4i16)) &&
-        !Subtarget.hasAVX()) {
+        (!Subtarget.hasSSSE3() || (InVT == MVT::v8i64 && VT == MVT::v8i8) ||
+         (InVT == MVT::v4i64 && VT == MVT::v4i16 && !Subtarget.hasAVX()))) {
       SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
                                        InEltVT.getSizeInBits() * WidenNumElts);
       Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
@@ -53266,57 +53287,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
-/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
-/// legalization the truncation will be translated into a BUILD_VECTOR with each
-/// element that is extracted from a vector and then truncated, and it is
-/// difficult to do this optimization based on them.
-/// TODO: Remove this and just use LowerTruncateVecPack.
-static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
-                                       const X86Subtarget &Subtarget) {
-  EVT OutVT = N->getValueType(0);
-  if (!OutVT.isVector())
-    return SDValue();
-
-  SDValue In = N->getOperand(0);
-  if (!In.getValueType().isSimple())
-    return SDValue();
-
-  EVT InVT = In.getValueType();
-  unsigned NumElems = OutVT.getVectorNumElements();
-
-  // AVX512 provides fast truncate ops.
-  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
-    return SDValue();
-
-  EVT OutSVT = OutVT.getVectorElementType();
-  EVT InSVT = InVT.getVectorElementType();
-  if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
-        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
-        NumElems >= 8))
-    return SDValue();
-
-  // SSSE3's pshufb results in less instructions in the cases below.
-  if (Subtarget.hasSSSE3() && NumElems == 8) {
-    if (InSVT == MVT::i16)
-      return SDValue();
-    if (InSVT == MVT::i32 &&
-        (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
-      return SDValue();
-  }
-
-  SDLoc DL(N);
-  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
-  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
-  // truncate 2 x v4i32 to v8i16.
-  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
-    return truncateVectorWithPACKUS(OutVT, In, DL, Subtarget, DAG);
-  if (InSVT == MVT::i32)
-    return truncateVectorWithPACKSS(OutVT, In, DL, Subtarget, DAG);
-
-  return SDValue();
-}
-
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
@@ -53664,7 +53634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
     return V;
-  return combineVectorTruncation(N, DAG, Subtarget);
+  return SDValue();
 }
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 5e4c4477a90b51..98c673a7cd17f9 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1798,50 +1798,92 @@ vector.ph:
}
define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
-; SSE2OR3-LABEL: psubus_16i32_max:
-; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2OR3-NEXT: movdqa %xmm5, %xmm8
-; SSE2OR3-NEXT: pxor %xmm7, %xmm8
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
-; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE2OR3-NEXT: pand %xmm9, %xmm5
-; SSE2OR3-NEXT: pxor %xmm8, %xmm9
-; SSE2OR3-NEXT: por %xmm5, %xmm9
-; SSE2OR3-NEXT: pslld $16, %xmm9
-; SSE2OR3-NEXT: psrad $16, %xmm9
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm10
-; SSE2OR3-NEXT: pxor %xmm7, %xmm10
-; SSE2OR3-NEXT: movdqa %xmm6, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pxor %xmm8, %xmm5
-; SSE2OR3-NEXT: por %xmm4, %xmm5
-; SSE2OR3-NEXT: pslld $16, %xmm5
-; SSE2OR3-NEXT: psrad $16, %xmm5
-; SSE2OR3-NEXT: packssdw %xmm9, %xmm5
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
-; SSE2OR3-NEXT: pxor %xmm7, %xmm4
-; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
-; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2OR3-NEXT: pand %xmm9, %xmm3
-; SSE2OR3-NEXT: pxor %xmm8, %xmm9
-; SSE2OR3-NEXT: por %xmm3, %xmm9
-; SSE2OR3-NEXT: pslld $16, %xmm9
-; SSE2OR3-NEXT: psrad $16, %xmm9
-; SSE2OR3-NEXT: pxor %xmm2, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2OR3-NEXT: pxor %xmm6, %xmm8
-; SSE2OR3-NEXT: pand %xmm2, %xmm6
-; SSE2OR3-NEXT: por %xmm8, %xmm6
-; SSE2OR3-NEXT: pslld $16, %xmm6
-; SSE2OR3-NEXT: psrad $16, %xmm6
-; SSE2OR3-NEXT: packssdw %xmm9, %xmm6
-; SSE2OR3-NEXT: psubusw %xmm6, %xmm0
-; SSE2OR3-NEXT: psubusw %xmm5, %xmm1
-; SSE2OR3-NEXT: retq
+; SSE2-LABEL: psubus_16i32_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: por %xmm3, %xmm9
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm10
+; SSE2-NEXT: por %xmm2, %xmm10
+; SSE2-NEXT: pslld $16, %xmm10
+; SSE2-NEXT: psrad $16, %xmm10
+; SSE2-NEXT: packssdw %xmm9, %xmm10
+; SSE2-NEXT: psubusw %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm7, %xmm2
+; SSE2-NEXT: movdqa %xmm6, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: por %xmm8, %xmm6
+; SSE2-NEXT: pslld $16, %xmm6
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: packssdw %xmm3, %xmm6
+; SSE2-NEXT: psubusw %xmm6, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_16i32_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSSE3-NEXT: pxor %xmm7, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: por %xmm3, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm3, %xmm9
+; SSSE3-NEXT: movdqa %xmm2, %xmm10
+; SSSE3-NEXT: pxor %xmm7, %xmm10
+; SSSE3-NEXT: movdqa %xmm6, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
+; SSSE3-NEXT: pand %xmm11, %xmm2
+; SSSE3-NEXT: pxor %xmm8, %xmm11
+; SSSE3-NEXT: por %xmm2, %xmm11
+; SSSE3-NEXT: pshufb %xmm3, %xmm11
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm9[0]
+; SSSE3-NEXT: psubusw %xmm11, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm2
+; SSSE3-NEXT: movdqa %xmm6, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: por %xmm5, %xmm9
+; SSSE3-NEXT: pshufb %xmm3, %xmm9
+; SSSE3-NEXT: pxor %xmm4, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm6, %xmm8
+; SSSE3-NEXT: pand %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm8, %xmm6
+; SSSE3-NEXT: pshufb %xmm3, %xmm6
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
+; SSSE3-NEXT: psubusw %xmm6, %xmm1
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: psubus_16i32_max:
; SSE41: # %bb.0: # %vector.ph
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index bb9c1b472540d5..a385e3d3244e61 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2049,57 +2049,106 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
}
define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"="256" {
-; SSE2-SSSE3-LABEL: trunc_packus_v16i32_v16i16:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3
-; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm0
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm4
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4
-; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: por %xmm4, %xmm6
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pslld $16, %xmm3
-; SSE2-SSSE3-NEXT: psrad $16, %xmm3
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pslld $16, %xmm4
-; SSE2-SSSE3-NEXT: psrad $16, %xmm4
-; SSE2-SSSE3-NEXT: pslld $16, %xmm1
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: trunc_packus_v16i32_v16i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa 48(%rdi), %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm3
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pandn %xmm5, %xmm3
+; SSSE3-NEXT: por %xmm1, %xmm3
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pandn %xmm5, %xmm6
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pandn %xmm5, %xmm4
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i32_v16i16:
; SSE41: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index cfbde2aeccf70b..e70443c81e1fc5 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1559,52 +1559,96 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
}
define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) {
-; SSE2-SSSE3-LABEL: trunc_usat_v16i32_v16i16:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm0
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8
-; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm3
-; SSE2-SSSE3-NEXT: por %xmm8, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8
-; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm8
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm0
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm2
-; SSE2-SSSE3-NEXT: pslld $16, %xmm2
-; SSE2-SSSE3-NEXT: psrad $16, %xmm2
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pslld $16, %xmm3
-; SSE2-SSSE3-NEXT: psrad $16, %xmm3
-; SSE2-SSSE3-NEXT: pslld $16, %xmm1
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: trunc_usat_v16i32_v16i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm5
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa 48(%rdi), %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm6, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: por %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: pxor %xmm6, %xmm8
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa (%rdi), %xmm5
+; SSSE3-NEXT: movdqa 16(%rdi), %xmm3
+; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
+; SSSE3-NEXT: movdqa 48(%rdi), %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm6, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm7, %xmm1
+; SSSE3-NEXT: por %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm8
+; SSSE3-NEXT: pxor %xmm7, %xmm4
+; SSSE3-NEXT: por %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: pxor %xmm6, %xmm8
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm7, %xmm0
+; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm2, %xmm7
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm3, %xmm2
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v16i32_v16i16:
; SSE41: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 92ee258c13a743..63f3d571edd4c7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -577,21 +577,34 @@ entry:
}
define void @trunc16i32_16i16(<16 x i32> %a) {
-; SSE2-SSSE3-LABEL: trunc16i32_16i16:
-; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: pslld $16, %xmm1
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pslld $16, %xmm3
-; SSE2-SSSE3-NEXT: psrad $16, %xmm3
-; SSE2-SSSE3-NEXT: pslld $16, %xmm2
-; SSE2-SSSE3-NEXT: psrad $16, %xmm2
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i16:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41: # %bb.0: # %entry
@@ -1600,7 +1613,7 @@ define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) {
;
; AVX2-LABEL: trunc2x16i16_32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
@@ -1646,29 +1659,13 @@ entry:
}
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: trunc2x8i16_16i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc2x8i16_16i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc2x8i16_16i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc2x8i16_16i8:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc2x8i16_16i8:
; AVX1: # %bb.0: # %entry