[llvm] r325494 - [X86][SSE] combineTruncateWithSat - use truncateVectorWithPACK down to 64-bit subvectors
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 19 05:29:20 PST 2018
Author: rksimon
Date: Mon Feb 19 05:29:20 2018
New Revision: 325494
URL: http://llvm.org/viewvc/llvm-project?rev=325494&view=rev
Log:
[X86][SSE] combineTruncateWithSat - use truncateVectorWithPACK down to 64-bit subvectors
Add support for chaining PACKSS/PACKUS down to 64-bit vectors by using only a single 128-bit input.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=325494&r1=325493&r2=325494&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Feb 19 05:29:20 2018
@@ -16597,8 +16597,8 @@ static SDValue truncateVectorWithPACK(un
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2 but AVX512 has fast vector truncate.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
return SDValue();
EVT SrcVT = In.getValueType();
@@ -16607,25 +16607,23 @@ static SDValue truncateVectorWithPACK(un
if (SrcVT == DstVT)
return In;
- // We only support vector truncation to 128bits or greater from a
- // 256bits or greater source.
+ // We only support vector truncation to 64bits or greater from a
+ // 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
- if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
+ if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
- LLVMContext &Ctx = *DAG.getContext();
unsigned NumElems = SrcVT.getVectorNumElements();
+ if (!isPowerOf2_32(NumElems))
+ return SDValue();
+
+ LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
-
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
@@ -16635,12 +16633,27 @@ static SDValue truncateVectorWithPACK(un
OutVT = MVT::i16;
}
+ // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
+ if (SrcVT.is128BitVector()) {
+ InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT,
+ DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
+ Res = extractSubVector(Res, 0, DAG, DL, 64);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // Extract lower/upper subvectors.
+ unsigned NumSubElts = NumElems / 2;
+ SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
- if (SrcVT.is256BitVector()) {
+ if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
@@ -16669,7 +16682,7 @@ static SDValue truncateVectorWithPACK(un
}
// Recursively pack lower/upper subvectors, concat result and pack again.
- assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
+ assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll?rev=325494&r1=325493&r2=325494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll Mon Feb 19 05:29:20 2018
@@ -3099,98 +3099,27 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
}
define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: movq %xmm4, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: pshufb %xmm0, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm2, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll?rev=325494&r1=325493&r2=325494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-ssat.ll Mon Feb 19 05:29:20 2018
@@ -2959,93 +2959,17 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<
}
define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: movq %xmm2, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: pshufb %xmm0, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm2, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_ssat_v8i32_v8i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -3053,10 +2977,6 @@ define void @trunc_ssat_v8i32_v8i8_store
;
; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
More information about the llvm-commits
mailing list