[llvm-commits] [llvm] r148225 - in /llvm/trunk: lib/CodeGen/SelectionDAG/TargetLowering.cpp lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/blend-msb.ll

Sun Jan 15 21:26:03 PST 2012

On Sun, Jan 15, 2012 at 7:01 PM, David Blaikie <dblaikie at gmail.com> wrote:
> On Sun, Jan 15, 2012 at 11:27 AM, Nadav Rotem <nadav.rotem at intel.com> wrote:
>> Author: nadav
>> Date: Sun Jan 15 13:27:55 2012
>> New Revision: 148225
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=148225&view=rev
>> Log:
>> [AVX] Optimize x86 VSELECT instructions using SimplifyDemandedBits.
>> We know that the blend instructions only use the MSB, so if the mask is
>> sign-extended then we can convert it into a SHL instruction. This is a
>> common pattern because the type-legalizer sign-extends the i1 type which
>> is used by the LLVM-IR for the condition.
>>
>> Added a new optimization in SimplifyDemandedBits for SIGN_EXTEND_INREG -> SHL.
>>
>>
>> Added:
>>    llvm/trunk/test/CodeGen/X86/blend-msb.ll
>> Modified:
>>    llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
>>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>>
>> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=148225&r1=148224&r2=148225&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
>> +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Sun Jan 15 13:27:55 2012
>> @@ -1608,23 +1608,40 @@
>>     }
>>     break;
>>   case ISD::SIGN_EXTEND_INREG: {
>> -    EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
>> +    EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
>> +
>> +    APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
>> +    // If we only care about the highest bit, don't bother shifting right.
>> +    if (MsbMask ==  DemandedMask) {
>> +      unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
>> +      SDValue InOp = Op.getOperand(0);
>> +      EVT InVT = Op.getOperand(0).getValueType();
>> +      EVT ShTy = getShiftAmountTy(InVT);
>
> ShTy is unused (& thus fails to compile under -Werror (-Wall is
> already on in the LLVM build)) - did you mean to use it, or remove it?
>
> - David

I removed the two unused variables (inVT and ShTy) in r148230 so LLVM
would build warning-free. Feel free to add them back (with usage) if
they were meant to have some use.

- David

>> +      // In this code we may handle vector types. We can't use the
>> +      // getShiftAmountTy API because it only works on scalars.
>> +      // We use the shift value type because we know that its an integer
>> +      // with enough bits.
>> +      SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt,
>> +                                             Op.getValueType());
>> +      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
>> +                                            Op.getValueType(), InOp, ShiftAmt));
>> +    }
>>
>>     // Sign extension.  Compute the demanded bits in the result that are not
>>     // present in the input.
>>     APInt NewBits =
>>       APInt::getHighBitsSet(BitWidth,
>> -                            BitWidth - EVT.getScalarType().getSizeInBits());
>> +                            BitWidth - ExVT.getScalarType().getSizeInBits());
>>
>>     // If none of the extended bits are demanded, eliminate the sextinreg.
>>     if ((NewBits & NewMask) == 0)
>>       return TLO.CombineTo(Op, Op.getOperand(0));
>>
>>     APInt InSignBit =
>> -      APInt::getSignBit(EVT.getScalarType().getSizeInBits()).zext(BitWidth);
>> +      APInt::getSignBit(ExVT.getScalarType().getSizeInBits()).zext(BitWidth);
>>     APInt InputDemandedBits =
>>       APInt::getLowBitsSet(BitWidth,
>> -                           EVT.getScalarType().getSizeInBits()) &
>> +                           ExVT.getScalarType().getSizeInBits()) &
>>       NewMask;
>>
>>     // Since the sign extended bits are demanded, we know that the sign
>> @@ -1642,7 +1659,7 @@
>>     // If the input sign bit is known zero, convert this into a zero extension.
>>     if (KnownZero.intersects(InSignBit))
>>       return TLO.CombineTo(Op,
>> -                           TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,EVT));
>> +                          TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,ExVT));
>>
>>     if (KnownOne.intersects(InSignBit)) {    // Input sign bit known set
>>       KnownOne |= NewBits;
>>
>> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=148225&r1=148224&r2=148225&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jan 15 13:27:55 2012
>> @@ -12868,6 +12868,7 @@
>>  /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
>>  /// nodes.
>>  static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
>> +                                    TargetLowering::DAGCombinerInfo &DCI,
>>                                     const X86Subtarget *Subtarget) {
>>   DebugLoc DL = N->getDebugLoc();
>>   SDValue Cond = N->getOperand(0);
>> @@ -13144,6 +13145,26 @@
>>     }
>>   }
>>
>> +  // If we know that this node is legal then we know that it is going to be
>> +  // matched by one of the SSE/AVX BLEND instructions. These instructions only
>> +  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
>> +  // to simplify previous instructions.
>> +  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
>> +  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
>> +      !DCI.isBeforeLegalize() &&
>> +      TLI.isOperationLegal(ISD::VSELECT, VT)) {
>> +    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
>> +    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
>> +    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
>> +
>> +    APInt KnownZero, KnownOne;
>> +    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
>> +                                          DCI.isBeforeLegalizeOps());
>> +    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
>> +        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
>> +      DCI.CommitTargetLoweringOpt(TLO);
>> +  }
>> +
>>   return SDValue();
>>  }
>>
>> @@ -14609,7 +14630,7 @@
>>   case ISD::EXTRACT_VECTOR_ELT:
>>     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
>>   case ISD::VSELECT:
>> -  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
>> +  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
>>   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
>>   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
>>   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
>>
>> Added: llvm/trunk/test/CodeGen/X86/blend-msb.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/blend-msb.ll?rev=148225&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/blend-msb.ll (added)
>> +++ llvm/trunk/test/CodeGen/X86/blend-msb.ll Sun Jan 15 13:27:55 2012
>> @@ -0,0 +1,37 @@
>> +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
>> +
>> +
>> +; In this test we check that sign-extend of the mask bit is performed by
>> +; shifting the needed bit to the MSB, and not using shl+sra.
>> +
>> +;CHECK: vsel_float
>> +;CHECK: pslld
>> +;CHECK-NEXT: blendvps
>> +;CHECK: ret
>> +define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
>> +  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
>> +  ret <4 x float> %vsel
>> +}
>> +
>> +;CHECK: vsel_4xi8
>> +;CHECK: pslld
>> +;CHECK-NEXT: blendvps
>> +;CHECK: ret
>> +define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
>> +  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
>> +  ret <4 x i8> %vsel
>> +}
>> +
>> +
>> +; We do not have native support for v8i16 blends and we have to use the
>> +; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not r
>> +; reduce the mask in this case.
>> +;CHECK: vsel_8xi16
>> +;CHECK: psllw
>> +;CHECK: psraw
>> +;CHECK: pblendvb
>> +;CHECK: ret
>> +define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
>> +  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
>> +  ret <8 x i16> %vsel
>> +}
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits