[llvm] r326308 - [X86] Don't use EXTRACT_ELEMENT from v1i1 with i8/i32 result type when we need to guarantee zeroes in the upper bits of return.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 28 00:14:28 PST 2018
Author: ctopper
Date: Wed Feb 28 00:14:28 2018
New Revision: 326308
URL: http://llvm.org/viewvc/llvm-project?rev=326308&view=rev
Log:
[X86] Don't use EXTRACT_ELEMENT from v1i1 with i8/i32 result type when we need to guarantee zeroes in the upper bits of return.
An extract_element where the result type is larger than the scalar element type is semantically an any_extend from the scalar element type to the result type. If we expect zeroes in the upper bits of the i8/i32 we need to make sure those zeroes are explicit in the DAG.
For these cases the best way to accomplish this is to use an insert_subvector to pad zeroes to the upper bits of the v1i1 first. We extend to either v16i1 (for i32) or v8i1 (for i8). Then bitcast that to a scalar and finish with a zero_extend up to i32 if necessary. We can't extend past v16i1 because that's the largest mask size on KNL. But isel is smart enough to know that a zext of a bitcast from v16i1 to i16 can use a KMOVW instruction. The insert_subvectors will be dropped during isel because we can determine that the producing instruction already zeroed the upper bits of the k-register.
Modified:
llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Wed Feb 28 00:14:28 2018
@@ -457,7 +457,7 @@ namespace {
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMU ||
- Opcode == X86ISD::CMPM_RND) {
+ Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
@@ -467,6 +467,10 @@ static bool isLegalMaskCompare(SDNode *N
return true;
}
+ // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
+ if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+ Opcode == X86ISD::FSETCCM_RND)
+ return true;
return false;
}
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Feb 28 00:14:28 2018
@@ -19948,6 +19948,7 @@ static SDValue getScalarMaskingNode(SDVa
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_RND ||
@@ -20417,9 +20418,11 @@ SDValue X86TargetLowering::LowerINTRINSI
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits in the v2i1/v4i1 case.
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), FPclassMask,
- DAG.getIntPtrConstant(0, dl));
+ DAG.getConstant(0, dl, BitcastVT),
+ FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
case FPCLASSS: {
@@ -20429,8 +20432,12 @@ SDValue X86TargetLowering::LowerINTRINSI
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ FPclassMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
}
case CMP_MASK: {
// Comparison intrinsics with masks.
@@ -20438,7 +20445,7 @@ SDValue X86TargetLowering::LowerINTRINSI
// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
- // (v8i1 (insert_subvector undef,
+ // (v8i1 (insert_subvector zero,
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
@@ -20451,9 +20458,11 @@ SDValue X86TargetLowering::LowerINTRINSI
Op.getOperand(2));
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits in the v2i1/v4i1 case.
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
- DAG.getUNDEF(BitcastVT), CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ DAG.getConstant(0, dl, BitcastVT),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
@@ -20497,8 +20506,12 @@ SDValue X86TargetLowering::LowerINTRINSI
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getConstant(0, dl, MVT::v8i1),
+ CmpMask, DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -20551,8 +20564,13 @@ SDValue X86TargetLowering::LowerINTRINSI
else
FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
- DAG.getIntPtrConstant(0, dl));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getConstant(0, dl, MVT::v16i1),
+ FCmp, DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
+ DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -33382,9 +33400,13 @@ static SDValue combineCompareEqual(SDNod
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- N->getSimpleValueType(0), FSetCC,
- DAG.getIntPtrConstant(0, DL));
+ // Need to fill with zeros to ensure the bitcast will produce zeroes
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
+ DAG.getConstant(0, DL, MVT::v16i1),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
+ return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
+ N->getSimpleValueType(0));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
Modified: llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrVecCompiler.td Wed Feb 28 00:14:28 2018
@@ -427,6 +427,7 @@ class maskzeroupper<ValueType vt, Regist
return isMaskZeroExtended(N);
}]>;
+def maskzeroupperv1i1 : maskzeroupper<v1i1, VK1>;
def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>;
def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>;
def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>;
@@ -438,11 +439,18 @@ def maskzeroupperv32i1 : maskzeroupper<v
// zeroing.
let Predicates = [HasBWI] in {
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK32)>;
def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
maskzeroupperv16i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK16:$src, VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK64)>;
@@ -456,10 +464,19 @@ let Predicates = [HasBWI] in {
let Predicates = [HasAVX512] in {
def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
maskzeroupperv8i1:$src, (iPTR 0))),
(COPY_TO_REGCLASS VK8:$src, VK16)>;
}
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv1i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+
let Predicates = [HasVLX, HasDQI] in {
def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
maskzeroupperv2i1:$src, (iPTR 0))),
Modified: llvm/trunk/test/CodeGen/X86/avx512-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cmp.ll?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cmp.ll Wed Feb 28 00:14:28 2018
@@ -48,19 +48,11 @@ l2:
}
define i32 @test3(float %a, float %b) {
-; KNL-LABEL: test3:
-; KNL: ## %bb.0:
-; KNL-NEXT: vcmpeqss %xmm1, %xmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test3:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0
-; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: movzbl %al, %eax
-; SKX-NEXT: retq
+; ALL-LABEL: test3:
+; ALL: ## %bb.0:
+; ALL-NEXT: vcmpeqss %xmm1, %xmm0, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: retq
%cmp10.i = fcmp oeq float %a, %b
%conv11.i = zext i1 %cmp10.i to i32
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Wed Feb 28 00:14:28 2018
@@ -3186,17 +3186,17 @@ define i8 @test_int_x86_avx512_mask_cmp_s
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: kmovw %k0, %esi
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: orb %cl, %dl
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
@@ -3231,17 +3231,17 @@ define i8 @test_int_x86_avx512_mask_cmp_s
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: andl %eax, %ecx
+; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: kmovw %k0, %esi
; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl %edx, %eax
-; CHECK-NEXT: andl %ecx, %eax
+; CHECK-NEXT: andb %cl, %dl
+; CHECK-NEXT: andb %sil, %al
+; CHECK-NEXT: andb %dl, %al
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Wed Feb 28 00:14:28 2018
@@ -1124,15 +1124,13 @@ define i32 @test3(float %a, float %b) {
; GENERIC-LABEL: test3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
-; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
-; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33]
+; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test3:
; SKX: # %bb.0:
; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
-; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
-; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25]
+; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%cmp10.i = fcmp oeq float %a, %b
Modified: llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll?rev=326308&r1=326307&r2=326308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll (original)
+++ llvm/trunk/test/CodeGen/X86/gpr-to-mask.ll Wed Feb 28 00:14:28 2018
@@ -60,13 +60,11 @@ define void @test_fcmp_storei1(i1 %cond,
; X86-64-NEXT: je .LBB1_2
; X86-64-NEXT: # %bb.1: # %if
; X86-64-NEXT: vcmpeqss %xmm1, %xmm0, %k0
-; X86-64-NEXT: jmp .LBB1_3
+; X86-64-NEXT: kmovb %k0, (%rdx)
+; X86-64-NEXT: retq
; X86-64-NEXT: .LBB1_2: # %else
; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k0
-; X86-64-NEXT: .LBB1_3: # %exit
-; X86-64-NEXT: kmovd %k0, %eax
-; X86-64-NEXT: andb $1, %al
-; X86-64-NEXT: movb %al, (%rdx)
+; X86-64-NEXT: kmovb %k0, (%rdx)
; X86-64-NEXT: retq
;
; X86-32-LABEL: test_fcmp_storei1:
@@ -77,14 +75,12 @@ define void @test_fcmp_storei1(i1 %cond,
; X86-32-NEXT: # %bb.1: # %if
; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
-; X86-32-NEXT: jmp .LBB1_3
+; X86-32-NEXT: kmovb %k0, (%eax)
+; X86-32-NEXT: retl
; X86-32-NEXT: .LBB1_2: # %else
; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
-; X86-32-NEXT: .LBB1_3: # %exit
-; X86-32-NEXT: kmovd %k0, %ecx
-; X86-32-NEXT: andb $1, %cl
-; X86-32-NEXT: movb %cl, (%eax)
+; X86-32-NEXT: kmovb %k0, (%eax)
; X86-32-NEXT: retl
entry:
br i1 %cond, label %if, label %else
More information about the llvm-commits
mailing list