[llvm] r320481 - [X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 13 18:44:31 PST 2017
Out of curiosity, why do this in the backend? Replacing the load from
such a constant array with the equivalent shift-and-subtract sequence
in IR would seem like a generally useful optimization. Trading a load
for a shift and subtract is clearly profitable. Once that's visible in
the IR, we could pattern match the BZHI pretty straightforwardly.
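Concretely, the IR-level rewrite could look something like the sketch
below (rewriteTableLoad is a hypothetical helper, and Idx is assumed to
already be cast to the load's type; a real transform would also have to
guard idx == 0, since shifting by the full bit width is poison in IR,
whereas BZHI handles that case natively):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch: given a load of table[idx], where table[i] == 2^i - 1,
    // build the equivalent mask computation (-1 >> (BitWidth - idx)).
    static Value *rewriteTableLoad(LoadInst *LI, Value *Idx,
                                   IRBuilder<> &B) {
      Type *Ty = LI->getType();                // e.g. i32
      unsigned BW = Ty->getIntegerBitWidth();  // e.g. 32
      Value *Amt = B.CreateSub(ConstantInt::get(Ty, BW), Idx);
      return B.CreateLShr(Constant::getAllOnesValue(Ty), Amt);
    }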
Philip
On 12/12/2017 06:13 AM, Ayman Musa via llvm-commits wrote:
> Author: aymanmus
> Date: Tue Dec 12 06:13:51 2017
> New Revision: 320481
>
> URL: http://llvm.org/viewvc/llvm-project?rev=320481&view=rev
> Log:
> [X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.
>
> Recognize constant arrays with the following values:
> 0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, ..., 2^(size-1) - 1
> where //size// is the size of the array.
>
> The result of a load with index //idx// from this array is equivalent to:
> (0xFFFFFFFF >> (32 - idx)) (assuming an array of 32-bit integers).
>
> The result of an 'and' between such a loaded value and another input is
> exactly the behavior of the X86 BZHI instruction.
>
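> For illustration, the equivalence is easy to check in plain C++ (a
> sketch; maskFor is a hypothetical helper, and the 64-bit shift keeps
> idx == 0 well defined):
>
>     #include <cstdint>
>
>     // table[idx] == 2^idx - 1, i.e. the low idx bits set.
>     static uint32_t maskFor(uint32_t idx) {
>       return (uint32_t)(0xFFFFFFFFull >> (32 - idx));
>     }
>     // For any x: (x & table[idx]) == (x & maskFor(idx)), which is
>     // exactly the behavior of bzhi(x, idx).
>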
> See the LIT test for concrete examples.
>
> Differential Revision: https://reviews.llvm.org/D34141
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320481&r1=320480&r2=320481&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 12 06:13:51 2017
> @@ -33066,6 +33066,124 @@ static SDValue combineAndMaskToShift(SDN
> return DAG.getBitcast(N->getValueType(0), Shift);
> }
>
> +// Get the index node from the lowered DAG of a GEP IR instruction with one
> +// indexing dimension.
> +static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
> + if (Ld->isIndexed())
> + return SDValue();
> +
> + SDValue Base = Ld->getBasePtr();
> +
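> + // A GEP with a single index lowers to (add (shl Index, log2(EltSize)), Base);
> + // expect the scaled-index term as operand 0 of the ADD.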
> + if (Base.getOpcode() != ISD::ADD)
> + return SDValue();
> +
> + SDValue ShiftedIndex = Base.getOperand(0);
> +
> + if (ShiftedIndex.getOpcode() != ISD::SHL)
> + return SDValue();
> +
> + return ShiftedIndex.getOperand(0);
> +}
> +
> +static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
> + if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
> + switch (VT.getSizeInBits()) {
> + default: return false;
> + case 64: return Subtarget.is64Bit();
> + case 32: return true;
> + }
> + }
> + return false;
> +}
> +
> +// This function recognizes cases where the X86 BZHI instruction can replace
> +// an 'and-load' sequence.
> +// When an integer value is loaded from an array of constants defined as:
> +//
> +// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
> +//
> +// and then combined with another input via a bitwise 'and', the result is
> +// equivalent to performing bzhi (zero high bits) on the input, with the
> +// same index as the load.
> +static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
> + const X86Subtarget &Subtarget) {
> + MVT VT = Node->getSimpleValueType(0);
> + SDLoc dl(Node);
> +
> + // Check if subtarget has BZHI instruction for the node's type
> + if (!hasBZHI(Subtarget, VT))
> + return SDValue();
> +
> + // Try matching the pattern for both operands.
> + for (unsigned i = 0; i < 2; i++) {
> + SDValue N = Node->getOperand(i);
> + LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
> +
> + // Skip this operand if it is not a load instruction.
> + if (!Ld)
> + continue;
> +
> + const Value *MemOp = Ld->getMemOperand()->getValue();
> +
> + if (!MemOp)
> + continue;
> +
> + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
> + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
> + if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
> +
> + Constant *Init = GV->getInitializer();
> + Type *Ty = Init->getType();
> + if (!isa<ConstantDataArray>(Init) ||
> + !Ty->getArrayElementType()->isIntegerTy() ||
> + Ty->getArrayElementType()->getScalarSizeInBits() !=
> + VT.getSizeInBits() ||
> + Ty->getArrayNumElements() >
> + Ty->getArrayElementType()->getScalarSizeInBits())
> + continue;
> +
> + // Check if the array's constant elements are suitable for our case.
> + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
> + bool ConstantsMatch = true;
> + for (uint64_t j = 0; j < ArrayElementCount; j++) {
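> + // Elements of an integer ConstantDataArray are always ConstantInts,
> + // so the dyn_cast below cannot return null.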
> + ConstantInt *Elem =
> + dyn_cast<ConstantInt>(Init->getAggregateElement(j));
> + if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
> + ConstantsMatch = false;
> + break;
> + }
> + }
> + if (!ConstantsMatch)
> + continue;
> +
> + // Do the transformation (for a 32-bit type):
> + // -> (and (load arr[idx]), inp)
> + // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
> + // which will then be matched to a single bzhi instruction.
> + SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
> + SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
> +
> + // Get the Node which indexes into the array.
> + SDValue Index = getIndexFromUnindexedLoad(Ld);
> + if (!Index)
> + return SDValue();
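> + // The index node may be narrower or wider than VT; normalize it first.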
> + Index = DAG.getZExtOrTrunc(Index, dl, VT);
> +
> + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
> +
> + SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
> + SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
> +
> + return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
> + }
> + }
> + }
> + }
> + return SDValue();
> +}
> +
> static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget &Subtarget) {
> @@ -33094,6 +33212,9 @@ static SDValue combineAnd(SDNode *N, Sel
> if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
> return ShiftRight;
>
> + if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
> + return R;
> +
> // Attempt to recursively combine a bitmask AND with shuffles.
> if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
> SDValue Op(N, 0);
>
> Modified: llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll?rev=320481&r1=320480&r2=320481&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll Tue Dec 12 06:13:51 2017
> @@ -10,17 +10,14 @@
> define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
> ; CHECK-LABEL: f32_bzhi:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: movslq %esi, %rax
> -; CHECK-NEXT: andl fill_table32(,%rax,4), %edi
> -; CHECK-NEXT: movl %edi, %eax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhil %esi, %edi, %eax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f32_bzhi:
> ; CHECK32: # %bb.0: # %entry
> ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: movl fill_table32(,%eax,4), %eax
> -; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
> +; CHECK32-NEXT: retl
> entry:
> %idxprom = sext i32 %y to i64
> %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
> @@ -32,17 +29,14 @@ entry:
> define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
> ; CHECK-LABEL: f32_bzhi_partial:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: movslq %esi, %rax
> -; CHECK-NEXT: andl fill_table32_partial(,%rax,4), %edi
> -; CHECK-NEXT: movl %edi, %eax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhil %esi, %edi, %eax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f32_bzhi_partial:
> ; CHECK32: # %bb.0: # %entry
> ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: movl fill_table32_partial(,%eax,4), %eax
> -; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
> +; CHECK32-NEXT: retl
> entry:
> %idxprom = sext i32 %y to i64
> %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
> @@ -54,9 +48,8 @@ entry:
> define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
> ; CHECK-LABEL: f64_bzhi:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: andq fill_table64(,%rsi,8), %rdi
> -; CHECK-NEXT: movq %rdi, %rax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f64_bzhi:
> ; CHECK32: # %bb.0: # %entry
> @@ -65,7 +58,7 @@ define i64 @f64_bzhi(i64 %x, i64 %y) loc
> ; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: retl
> entry:
> %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
> %0 = load i64, i64* %arrayidx, align 8
> @@ -76,9 +69,8 @@ entry:
> define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
> ; CHECK-LABEL: f64_bzhi_partial:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: andq fill_table64_partial(,%rsi,8), %rdi
> -; CHECK-NEXT: movq %rdi, %rax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f64_bzhi_partial:
> ; CHECK32: # %bb.0: # %entry
> @@ -87,7 +79,7 @@ define i64 @f64_bzhi_partial(i64 %x, i64
> ; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: retl
> entry:
> %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
> %0 = load i64, i64* %arrayidx, align 8
>
>