[llvm] r340212 - [X86] Teach combineTruncatedArithmetic to handle some cases of ISD::SUB
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 20 13:57:35 PDT 2018
Author: ctopper
Date: Mon Aug 20 13:57:35 2018
New Revision: 340212
URL: http://llvm.org/viewvc/llvm-project?rev=340212&view=rev
Log:
[X86] Teach combineTruncatedArithmetic to handle some cases of ISD::SUB
We can safely avoid interfering with the subus combine if both inputs are freely truncatable. Either both extends, or an extend and a constant vector.
Differential Revision: https://reviews.llvm.org/D50878
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=340212&r1=340211&r2=340212&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Aug 20 13:57:35 2018
@@ -36795,34 +36795,20 @@ static SDValue combineTruncatedArithmeti
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
- auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+ auto IsFreeTruncation = [VT](SDValue Op) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
- // Repeated operand, so we are only trading one output truncation for
- // one input truncation.
- if (Op0 == Op1)
- return true;
-
- // See if either operand has been extended from a smaller/equal size to
+ // See if this has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
- unsigned Opcode0 = Op0.getOpcode();
- if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
- Opcode0 == ISD::ZERO_EXTEND) &&
- Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ unsigned Opcode = Op.getOpcode();
+ if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
+ Opcode == ISD::ZERO_EXTEND) &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
- unsigned Opcode1 = Op1.getOpcode();
- if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
- Opcode1 == ISD::ZERO_EXTEND) &&
- Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
- return true;
-
- // See if either operand is a single use constant which can be constant
- // folded.
- SDValue BC0 = peekThroughOneUseBitcasts(Op0);
- SDValue BC1 = peekThroughOneUseBitcasts(Op1);
- return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
- ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+ // See if this is a single use constant which can be constant folded.
+ SDValue BC = peekThroughOneUseBitcasts(Op);
+ return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
@@ -36850,7 +36836,7 @@ static SDValue combineTruncatedArithmeti
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
- IsRepeatedOpOrFreeTruncation(Op0, Op1))
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -36863,11 +36849,20 @@ static SDValue combineTruncatedArithmeti
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
- // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
- IsRepeatedOpOrFreeTruncation(Op0, Op1))
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ case ISD::SUB: {
+ // TODO: ISD::SUB We are conservative and require both sides to be freely
+ // truncatable to avoid interfering with combineSubToSubus.
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(Opcode, VT) &&
+ (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
return TruncateArithmetic(Op0, Op1);
break;
}
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=340212&r1=340211&r2=340212&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Mon Aug 20 13:57:35 2018
@@ -1324,79 +1324,13 @@ define <16 x i8> @trunc_sub_v16i16_v16i8
define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; SSE-NEXT: psubw %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = zext <16 x i8> %x to <16 x i16>
%b = zext <16 x i8> %y to <16 x i16>
%c = sub <16 x i16> %a, %b
@@ -1856,69 +1790,13 @@ define <16 x i8> @trunc_sub_const_v16i16
define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = zext <16 x i8> %x to <16 x i16>
%b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%c = trunc <16 x i16> %b to <16 x i8>
@@ -1928,78 +1806,16 @@ define <16 x i8> @trunc_ext_sub_const_rh
define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
-; SSE-NEXT: psubw %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [8,9,10,11,12,13,14,15]
-; SSE-NEXT: psubw %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE-NEXT: psubb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,14,15]
-; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%a = zext <16 x i8> %x to <16 x i16>
%b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
%c = trunc <16 x i16> %b to <16 x i8>
More information about the llvm-commits
mailing list