[llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
Nadav Rotem
nrotem at apple.com
Wed Jan 2 09:46:08 PST 2013
Hi Elena,
I fixed the bug you reported in r171398.
I don't understand your suggestion. If we extract a subvector, we would only take part of the mask. Did you mean something else?
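For concreteness, a minimal IR sketch of the concern (the function names below are illustrative only): truncating the setcc result keeps the low bits of all eight lanes, while extracting the low half and bitcasting only reinterprets lanes 0-3, so the upper half of the mask is lost.

; (a) truncate: every setcc lane contributes its low 16 bits.
define <8 x i16> @via_trunc(<8 x i32> %setcc) {
  %t = trunc <8 x i32> %setcc to <8 x i16>
  ret <8 x i16> %t
}

; (b) the EXTRACT_SUBVECTOR + BITCAST sequence, written as IR: only setcc
; lanes 0-3 contribute, each reinterpreted as two i16 lanes.
define <8 x i16> @via_subvector(<8 x i32> %setcc) {
  %half = shufflevector <8 x i32> %setcc, <8 x i32> undef,
                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cast = bitcast <4 x i32> %half to <8 x i16>
  ret <8 x i16> %cast
}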
Thanks,
Nadav
On Jan 2, 2013, at 4:06 AM, "Demikhovsky, Elena" <elena.demikhovsky at intel.com> wrote:
> I propose to replace the sequence:
> setcc(vector) -> truncate
>
> with
>
> setcc(vector) -> EXTRACT_SUBVECTOR -> bitcast
>
>
> static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
>                                       TargetLowering::DAGCombinerInfo &DCI,
>                                       const X86Subtarget *Subtarget) {
>   EVT VT = N->getValueType(0);
>   if (DCI.isBeforeLegalize() || !VT.isVector())
>     return SDValue();
>
>   SDValue In = N->getOperand(0);
>   if (In.getOpcode() == ISD::SETCC) {
>     DebugLoc DL = N->getDebugLoc();
>     EVT InVT = In.getValueType();
>
>     // The vector is all ones or all zeros. Just take a half of it.
>     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
>                                   InVT.getVectorNumElements()/2);
>     SDValue HalfVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, In,
>                                   DAG.getIntPtrConstant(0));
>     assert(HalfVT.getSizeInBits() == VT.getSizeInBits());
>     return DAG.getNode(ISD::BITCAST, DL, VT, HalfVec);
>   }
>   return SDValue();
> }
>
> - Elena
>
> -----Original Message-----
> From: Demikhovsky, Elena
> Sent: Wednesday, January 02, 2013 14:01
> To: 'Nadav Rotem'
> Cc: llvm-commits at cs.uiuc.edu
> Subject: RE: [llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
>
> Hi Nadav,
>
> After this patch the generated code is wrong.
>
> define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
>   %c1 = fadd <8 x float> %a, %b
>   %b1 = fmul <8 x float> %b, %a
>   %d = fsub <8 x float> %b1, %c1
>   %res1 = fcmp olt <8 x float> %a, %b1
>   %res2 = fcmp olt <8 x float> %c1, %d
>   %andr = and <8 x i1> %res1, %res2
>   %ex = zext <8 x i1> %andr to <8 x i32>
>   ret <8 x i32> %ex
> }
>
> W:\LLVM_org\build64\bin\Debug>w:\AVX3\build64\bin\Debug\llc.exe < x.ll -mtriple=x86_64-apple-darwin
> .section __TEXT,__text,regular,pure_instructions
> .section __TEXT,__const
> .align 5
> LCPI0_0:
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .long 1 ## 0x1
> .section __TEXT,__text,regular,pure_instructions
> .globl _test
> .align 4, 0x90
> _test: ## @test
> .cfi_startproc
> ## BB#0:
> vmulps %ymm0, %ymm1, %ymm1
> vcmpltps %ymm1, %ymm0, %ymm0
> vandps LCPI0_0(%rip), %ymm0, %ymm0
> ret
> .cfi_endproc
>
> - Elena
>
> -----Original Message-----
> From: llvm-commits-bounces at cs.uiuc.edu [mailto:llvm-commits-bounces at cs.uiuc.edu] On Behalf Of Nadav Rotem
> Sent: Thursday, December 27, 2012 10:16
> To: llvm-commits at cs.uiuc.edu
> Subject: [llvm-commits] [llvm] r171148 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/v8i1-masks.ll
>
> Author: nadav
> Date: Thu Dec 27 02:15:45 2012
> New Revision: 171148
>
> URL: http://llvm.org/viewvc/llvm-project?rev=171148&view=rev
> Log:
> On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized register. In most cases we actually compare or select YMM-sized registers and mixing the two types creates horrible code. This commit optimizes some of the transition sequences.
>
> PR14657.
>
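> As a rough sketch of the sequence this combine targets (the function below is illustrative only; the added v8i1-masks.ll test exercises the same pattern): the v8i1 mask produced by the wide compares is legalized to a v8i16 (XMM) value, so the i1 'and' and the zero-extend back to v8i32 end up straddling XMM and YMM registers.
>
> define <8 x i32> @mask_and(<8 x float> %x, <8 x float> %y, <8 x float> %z) {
>   %m0 = fcmp olt <8 x float> %x, %y    ; YMM-sized compare
>   %m1 = fcmp olt <8 x float> %x, %z    ; YMM-sized compare
>   %m = and <8 x i1> %m0, %m1           ; legalized to an XMM (v8i16) 'and'
>   %r = zext <8 x i1> %m to <8 x i32>   ; extended back to a YMM value
>   ret <8 x i32> %r
> }
>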
> Added:
> llvm/trunk/test/CodeGen/X86/v8i1-masks.ll
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=171148&r1=171147&r2=171148&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Dec 27 02:15:45 2012
> @@ -15731,9 +15731,92 @@
> return false;
> }
>
> +// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
> +// register. In most cases we actually compare or select YMM-sized registers
> +// and mixing the two types creates horrible code. This method optimizes
> +// some of the transition sequences.
> +static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
> +                                   TargetLowering::DAGCombinerInfo &DCI,
> +                                   const X86Subtarget *Subtarget) {
> +  EVT VT = N->getValueType(0);
> +  if (VT.getSizeInBits() != 256)
> +    return SDValue();
> +
> +  assert((N->getOpcode() == ISD::ANY_EXTEND ||
> +          N->getOpcode() == ISD::ZERO_EXTEND ||
> +          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
> +
> +  SDValue Narrow = N->getOperand(0);
> +  EVT NarrowVT = Narrow->getValueType(0);
> +  if (NarrowVT.getSizeInBits() != 128)
> +    return SDValue();
> +
> +  if (Narrow->getOpcode() != ISD::XOR &&
> +      Narrow->getOpcode() != ISD::AND &&
> +      Narrow->getOpcode() != ISD::OR)
> +    return SDValue();
> +
> +  SDValue N0 = Narrow->getOperand(0);
> +  SDValue N1 = Narrow->getOperand(1);
> +  DebugLoc DL = Narrow->getDebugLoc();
> +
> +  // The Left side has to be a trunc.
> +  if (N0.getOpcode() != ISD::TRUNCATE)
> +    return SDValue();
> +
> +  // The type of the truncated inputs.
> +  EVT WideVT = N0->getOperand(0)->getValueType(0);
> +  if (WideVT != VT)
> +    return SDValue();
> +
> +  // The right side has to be a 'trunc' or a constant vector.
> +  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
> +  bool RHSConst = (isSplatVector(N1.getNode()) &&
> +                   isa<ConstantSDNode>(N1->getOperand(0)));
> +  if (!RHSTrunc && !RHSConst)
> +    return SDValue();
> +
> +  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> +
> +  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
> +    return SDValue();
> +
> +  // Set N0 and N1 to hold the inputs to the new wide operation.
> +  N0 = N0->getOperand(0);
> +  if (RHSConst) {
> +    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
> +                     N1->getOperand(0));
> +    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
> +    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
> +  } else if (RHSTrunc) {
> +    N1 = N1->getOperand(0);
> +  }
> +
> +  // Generate the wide operation.
> +  SDValue Op = DAG.getNode(N->getOpcode(), DL, WideVT, N0, N1);
> +  unsigned Opcode = N->getOpcode();
> +  switch (Opcode) {
> +  case ISD::ANY_EXTEND:
> +    return Op;
> +  case ISD::ZERO_EXTEND: {
> +    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
> +    APInt Mask = APInt::getAllOnesValue(InBits);
> +    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
> +    return DAG.getNode(ISD::AND, DL, VT,
> +                       Op, DAG.getConstant(Mask, VT));
> +  }
> +  case ISD::SIGN_EXTEND:
> +    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
> +                       Op, DAG.getValueType(NarrowVT));
> +  default:
> +    llvm_unreachable("Unexpected opcode");
> +  }
> +}
> +
> static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -15741,8 +15824,6 @@
> if (R.getNode())
> return R;
>
> - EVT VT = N->getValueType(0);
> -
> // Create BLSI, and BLSR instructions
> // BLSI is X & (-X)
> // BLSR is X & (X-1)
> @@ -15803,6 +15884,7 @@
> static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -15810,8 +15892,6 @@
> if (R.getNode())
> return R;
>
> - EVT VT = N->getValueType(0);
> -
> SDValue N0 = N->getOperand(0);
> SDValue N1 = N->getOperand(1);
>
> @@ -15991,6 +16071,7 @@
> static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> + EVT VT = N->getValueType(0);
> if (DCI.isBeforeLegalizeOps())
> return SDValue();
>
> @@ -16004,8 +16085,6 @@
> if (!Subtarget->hasBMI())
> return SDValue();
>
> - EVT VT = N->getValueType(0);
> -
> if (VT != MVT::i32 && VT != MVT::i64)
> return SDValue();
>
> @@ -16671,6 +16750,12 @@
> EVT OpVT = Op.getValueType();
> DebugLoc dl = N->getDebugLoc();
>
> + if (VT.isVector() && VT.getSizeInBits() == 256) {
> + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
> + if (R.getNode())
> + return R;
> + }
> +
> if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
> (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
>
> @@ -16768,15 +16853,21 @@
> N0.hasOneUse() &&
> N0.getOperand(0).hasOneUse()) {
> SDValue N00 = N0.getOperand(0);
> - if (N00.getOpcode() != X86ISD::SETCC_CARRY)
> - return SDValue();
> - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
> - if (!C || C->getZExtValue() != 1)
> - return SDValue();
> - return DAG.getNode(ISD::AND, dl, VT,
> - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
> - N00.getOperand(0), N00.getOperand(1)),
> - DAG.getConstant(1, VT));
> + if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
> + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
> + if (!C || C->getZExtValue() != 1)
> + return SDValue();
> + return DAG.getNode(ISD::AND, dl, VT,
> + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
> + N00.getOperand(0), N00.getOperand(1)),
> + DAG.getConstant(1, VT));
> + }
> + }
> +
> + if (VT.isVector() && VT.getSizeInBits() == 256) {
> + SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
> + if (R.getNode())
> + return R;
> }
>
> // Optimize vectors in AVX mode:
>
> Added: llvm/trunk/test/CodeGen/X86/v8i1-masks.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/v8i1-masks.ll?rev=171148&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/v8i1-masks.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/v8i1-masks.ll Thu Dec 27 02:15:45 2012
> @@ -0,0 +1,38 @@
> +; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
> +
> +;CHECK: and_masks
> +;CHECK: vmovups
> +;CHECK-NEXT: vcmpltp
> +;CHECK-NEXT: vandps
> +;CHECK-NEXT: vmovups
> +;CHECK: ret
> +
> +define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
> +  %v0 = load <8 x float>* %a, align 16
> +  %v1 = load <8 x float>* %b, align 16
> +  %m0 = fcmp olt <8 x float> %v1, %v0
> +  %v2 = load <8 x float>* %c, align 16
> +  %m1 = fcmp olt <8 x float> %v2, %v0
> +  %mand = and <8 x i1> %m1, %m0
> +  %r = zext <8 x i1> %mand to <8 x i32>
> +  store <8 x i32> %r, <8 x i32>* undef, align 16
> +  ret void
> +}
> +
> +;CHECK: neg_mask
> +;CHECK: vmovups
> +;CHECK-NEXT: vcmpltps
> +;CHECK-NEXT: vandps
> +;CHECK-NEXT: vmovups
> +;CHECK: ret
> +
> +define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
> +  %v0 = load <8 x float>* %a, align 16
> +  %v1 = load <8 x float>* %b, align 16
> +  %m0 = fcmp olt <8 x float> %v1, %v0
> +  %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
> +  %r = zext <8 x i1> %mand to <8 x i32>
> +  store <8 x i32> %r, <8 x i32>* undef, align 16
> +  ret void
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> ---------------------------------------------------------------------
> Intel Israel (74) Limited
>
> This e-mail and any attachments may contain confidential material for
> the sole use of the intended recipient(s). Any review or distribution
> by others is strictly prohibited. If you are not the intended
> recipient, please contact the sender and delete all copies.
>