[llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
Nadav Rotem
nrotem at apple.com
Wed Jan 2 09:46:08 PST 2013
Hi Elena,
I fixed the bug you reported in r171398.
I don't understand your suggestion. If we extract a subvector, we would only take part of the mask. Did you mean something else?
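For concreteness, a minimal IR sketch of the concern (the function names below are illustrative only): truncating the setcc result keeps the low bits of all eight lanes, while extracting the low half and bitcasting only reinterprets lanes 0-3, so the upper half of the mask is lost.

; (a) truncate: every setcc lane contributes its low 16 bits.
define <8 x i16> @via_trunc(<8 x i32> %setcc) {
  %t = trunc <8 x i32> %setcc to <8 x i16>
  ret <8 x i16> %t
}

; (b) the EXTRACT_SUBVECTOR + BITCAST sequence, written as IR: only setcc
; lanes 0-3 contribute, each reinterpreted as two i16 lanes.
define <8 x i16> @via_subvector(<8 x i32> %setcc) {
  %half = shufflevector <8 x i32> %setcc, <8 x i32> undef,
                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cast = bitcast <4 x i32> %half to <8 x i16>
  ret <8 x i16> %cast
}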
Thanks,
Nadav
On Jan 2, 2013, at 4:06 AM, "Demikhovsky, Elena" <elena.demikhovsky at intel.com> wrote:
> I propose to replace the sequence:
> setcc(vector) -> truncate
>
> with
>
> setcc(vector) -> EXTRACT_SUBVECTOR -> bitcast
>
>
> static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
>                                       TargetLowering::DAGCombinerInfo &DCI,
>                                       const X86Subtarget *Subtarget) {
>   EVT VT = N->getValueType(0);
>   if (DCI.isBeforeLegalize() || !VT.isVector())
>     return SDValue();
>
>   SDValue In = N->getOperand(0);
>   if (In.getOpcode() == ISD::SETCC) {
>     DebugLoc DL = N->getDebugLoc();
>     EVT InVT = In.getValueType();
>
>     // The vector is all ones or all zeros. Just take a half of it.
>     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
>                                   InVT.getVectorNumElements()/2);
>     SDValue HalfVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, In,
>                                   DAG.getIntPtrConstant(0));
>     assert(HalfVT.getSizeInBits() == VT.getSizeInBits());
>     return DAG.getNode(ISD::BITCAST, DL, VT, HalfVec);
>   }
>   return SDValue();
> }
>
> - Elena
>
> -----Original Message-----
> From: Demikhovsky, Elena
> Sent: Wednesday, January 02, 2013 14:01
> To: 'Nadav Rotem'
> Cc: llvm-commits at cs.uiuc.edu
> Subject: RE: [llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
>
> Hi Nadav,
>
> After this patch the generated code is wrong.
>
> define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
>   %c1 = fadd <8 x float> %a, %b
>   %b1 = fmul <8 x float> %b, %a
>   %d = fsub <8 x float> %b1, %c1
>   %res1 = fcmp olt <8 x float> %a, %b1
>   %res2 = fcmp olt <8 x float> %c1, %d
>   %andr = and <8 x i1> %res1, %res2
>   %ex = zext <8 x i1> %andr to <8 x i32>
>   ret <8 x i32> %ex
> }
>
> W:\LLVM_org\build64\bin\Debug>w:\AVX3\build64\bin\Debug\llc.exe < x.ll -mtriple=x86_64-apple-darwin
> .section __TEXT,__text,regular,pure_instructions
> .section __TEXT,__const
> .align 5
> LCPI0_0:
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .section __TEXT,__text,regular,pure_instructions
> .globl _test
> .align 4, 0x90
> _test: ## @test
> .cfi_startproc
> ## BB#0:
> vmulps %ymm0, %ymm1, %ymm1
> vcmpltps %ymm1, %ymm0, %ymm0
> vandps LCPI0_0(%rip), %ymm0, %ymm0
> ret
> .cfi_endproc
>
> - Elena
>
> -----Original Message-----
> From: llvm-commits-bounces at cs.uiuc.edu [mailto:llvm-commits-bounces at cs.uiuc.edu] On Behalf Of Nadav Rotem
> Sent: Thursday, December 27, 2012 10:16
> To: llvm-commits at cs.uiuc.edu
> Subject: [llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
>
> Author: nadav
> Date: Thu Dec 27 02:15:45 2012
> New Revision: 171148
>
> URL: http://llvm.org/viewvc/llvm-project?rev=171148&view=rev
> Log:
> On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized register. In most cases we actually compare or select YMM-sized registers and mixing the two types creates horrible code. This commit optimizes some of the transition sequences.
>
> PR14657.
>
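> As a rough sketch of the sequence this combine targets (the function below is illustrative only; the added v8i1-masks.ll test exercises the same pattern): the v8i1 mask produced by the wide compares is legalized to a v8i16 (XMM) value, so the i1 'and' and the zero-extend back to v8i32 end up straddling XMM and YMM registers.
>
> define <8 x i32> @mask_and(<8 x float> %x, <8 x float> %y, <8 x float> %z) {
>   %m0 = fcmp olt <8 x float> %x, %y    ; YMM-sized compare
>   %m1 = fcmp olt <8 x float> %x, %z    ; YMM-sized compare
>   %m = and <8 x i1> %m0, %m1           ; legalized to an XMM (v8i16) 'and'
>   %r = zext <8 x i1> %m to <8 x i32>   ; extended back to a YMM value
>   ret <8 x i32> %r
> }
>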
> Added:
> llvm/trunk/test/CodeGen/X86/v8i1-masks.ll
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=171148&r1=171147&r2=171148&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Dec 27 02:15:45 2012
> @@ -15731,9 +15731,92 @@
> return false;
> }
>
> +// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
> +// register. In most cases we actually compare or select YMM-sized registers
> +// and mixing the two types creates horrible code. This method optimizes
> +// some of the transition sequences.
> +static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
> +                                   TargetLowering::DAGCombinerInfo &DCI,
> +                                   const X86Subtarget *Subtarget) {
> +  EVT VT = N->getValueType(0);
> +  if (VT.getSizeInBits() != 256)
> +    return SDValue();
> +
> +  assert((N->getOpcode() == ISD::ANY_EXTEND ||
> +          N->getOpcode() == ISD::ZERO_EXTEND ||
> +          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
> +
> +  SDValue Narrow = N->getOperand(0);
> +  EVT NarrowVT = Narrow->getValueType(0);
> +  if (NarrowVT.getSizeInBits() != 128)
> +    return SDValue();
> +
> +  if (Narrow->getOpcode() != ISD::XOR &&
> +      Narrow->getOpcode() != ISD::AND &&
> +      Narrow->getOpcode() != ISD::OR)
> +    return SDValue();
> +
> +  SDValue N0 = Narrow->getOperand(0);
> +  SDValue N1 = Narrow->getOperand(1);
> +  DebugLoc DL = Narrow->getDebugLoc();
> +
> +  // The Left side has to be a trunc.
> +  if (N0.getOpcode() != ISD::TRUNCATE)
> +    return SDValue();
> +
> +  // The type of the truncated inputs.
> +  EVT WideVT = N0->getOperand(0)->getValueType(0);
> +  if (WideVT != VT)
> +    return SDValue();
> +
> +  // The right side has to be a 'trunc' or a constant vector.
> +  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
> +  bool RHSConst = (isSplatVector(N1.getNode()) &&
> +                   isa<ConstantSDNode>(N1->getOperand(0)));
> +  if (!RHSTrunc && !RHSConst)
> +    return SDValue();
> +
> +  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> +
> +  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
> +    return SDValue();
> +
> +  // Set N0 and N1 to hold the inputs to the new wide operation.
> +  N0 = N0->getOperand(0);
> +  if (RHSConst) {
> +    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
> +                     N1->getOperand(0));
> +    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
> +    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
> +  } else if (RHSTrunc) {
> +    N1 = N1->getOperand(0);
> +  }
> +
> +  // Generate the wide operation.
> +  SDValue Op = DAG.getNode(N->getOpcode(), DL, WideVT, N0, N1);
> +  unsigned Opcode = N->getOpcode();
> +  switch (Opcode) {
> +  case ISD::ANY_EXTEND:
> +    return Op;
> +  case ISD::ZERO_EXTEND: {
> +    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
> +    APInt Mask = APInt::getAllOnesValue(InBits);
> +    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
> +    return DAG.getNode(ISD::AND, DL, VT,
> +                       Op, DAG.getConstant(Mask, VT));
> +  }
> +  case ISD::SIGN_EXTEND:
> +    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
> +                       Op, DAG.getValueType(NarrowVT));
> +  default:
> +    llvm_unreachable("Unexpected opcode");
> +  }
> +}
> +
> static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -15741,8 +15824,6 @@
> if (R.getNode())
> return R;
>
> - EVT VT = N->getValueType(0);
> -
> // Create BLSI, and BLSR instructions
> // BLSI is X & (-X)
> // BLSR is X & (X-1)
> @@ -15803,6 +15884,7 @@
> static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -15810,8 +15892,6 @@
> if (R.getNode())
> return R;
>
> - EVT VT = N->getValueType(0);
> -
> SDValue N0 = N->getOperand(0);
> SDValue N1 = N->getOperand(1);
>
> @@ -15991,6 +16071,7 @@
> static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -16004,8 +16085,6 @@
> if (!Subtarget->hasBMI())
> return SDValue();
>
> - EVT VT = N->getValueType(0);
> -
> if (VT != MVT::i32 && VT != MVT::i64)
> return SDValue();
>
> @@ -16671,6 +16750,12 @@
> EVT OpVT = Op.getValueType();
> DebugLoc dl = N->getDebugLoc();
>
> + if (VT.isVector() && VT.getSizeInBits() == 256) {
> + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
> + if (R.getNode())
> + return R;
> + }
> +
> if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
> (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
>
> @@ -16768,15 +16853,21 @@
> N0.hasOneUse() &&
> N0.getOperand(0).hasOneUse()) {
> SDValue N00 = N0.getOperand(0);
> - if (N00.getOpcode() != X86ISD::SETCC_CARRY)
> - return SDValue();
> - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
> - if (!C || C->getZExtValue() != 1)
> - return SDValue();
> - return DAG.getNode(ISD::AND, dl, VT,
> - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
> - N00.getOperand(0), N00.getOperand(1)),
> - DAG.getConstant(1, VT));
> + if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
> + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
> + if (!C || C->getZExtValue() != 1)
> + return SDValue();
> + return DAG.getNode(ISD::AND, dl, VT,
> + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
> + N00.getOperand(0), N00.getOperand(1)),
> + DAG.getConstant(1, VT));
> + }
> + }
> +
> + if (VT.isVector() && VT.getSizeInBits() == 256) {
> + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
> + if (R.getNode())
> + return R;
> }
>
> // Optimize vectors in AVX mode:
>
> Added: llvm/trunk/test/CodeGen/X86/v8i1-masks.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/v8i1-masks.ll?rev=171148&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/v8i1-masks.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/v8i1-masks.ll Thu Dec 27 02:15:45 2012
> @@ -0,0 +1,38 @@
> +; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
> +
> +;CHECK: and_masks
> +;CHECK: vmovups
> +;CHECK-NEXT: vcmpltp
> +;CHECK-NEXT: vandps
> +;CHECK-NEXT: vmovups
> +;CHECK: ret
> +
> +define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
> +  %v0 = load <8 x float>* %a, align 16
> +  %v1 = load <8 x float>* %b, align 16
> +  %m0 = fcmp olt <8 x float> %v1, %v0
> +  %v2 = load <8 x float>* %c, align 16
> +  %m1 = fcmp olt <8 x float> %v2, %v0
> +  %mand = and <8 x i1> %m1, %m0
> +  %r = zext <8 x i1> %mand to <8 x i32>
> +  store <8 x i32> %r, <8 x i32>* undef, align 16
> +  ret void
> +}
> +
> +;CHECK: neg_mask
> +;CHECK: vmovups
> +;CHECK-NEXT: vcmpltps
> +;CHECK-NEXT: vandps
> +;CHECK-NEXT: vmovups
> +;CHECK: ret
> +
> +define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
> +  %v0 = load <8 x float>* %a, align 16
> +  %v1 = load <8 x float>* %b, align 16
> +  %m0 = fcmp olt <8 x float> %v1, %v0
> +  %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
> +  %r = zext <8 x i1> %mand to <8 x i32>
> +  store <8 x i32> %r, <8 x i32>* undef, align 16
> +  ret void
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> ---------------------------------------------------------------------
> Intel Israel (74) Limited
>
> This e-mail and any attachments may contain confidential material for
> the sole use of the intended recipient(s). Any review or distribution
> by others is strictly prohibited. If you are not the intended
> recipient, please contact the sender and delete all copies.
>