[llvm] r320481 - [X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.

Ayman Musa via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 12 06:13:51 PST 2017


Author: aymanmus
Date: Tue Dec 12 06:13:51 2017
New Revision: 320481

URL: http://llvm.org/viewvc/llvm-project?rev=320481&view=rev
Log:
[X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.

Recognize constant arrays with the following values:
  0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, ..., 2^(size - 1) - 1
where *size* is the number of elements in the array.

The result of a load with index *idx* from such an array is equivalent to:
  (0xFFFFFFFF >> (sub 32, idx))             (assuming an array of 32-bit integers)
For example, with idx = 5 the loaded value is 0x1F, which equals 0xFFFFFFFF >> 27.

ANDing the result of such a load with another input is exactly equivalent to the behavior of the X86 BZHI instruction.

See the test cases in the LIT test for more examples.
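
As a hedged illustration (the table and function names here are hypothetical, not taken from this patch), the source-level pattern this combine targets looks like the following; note that the table need not span all 32 entries, as the "partial" tests below show, as long as element j equals 2^j - 1 and the element count does not exceed the element bit width:

  static const int fill_table[8] = {0x0, 0x1, 0x3, 0x7,
                                    0xF, 0x1F, 0x3F, 0x7F};

  int mask_low_bits(int x, int n) {
    // With a BMI2-enabled target (e.g. -mbmi2), the and-of-load below
    // should compile to a single 'bzhil' instruction once this combine
    // fires.
    return x & fill_table[n];
  }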

Differential Revision: https://reviews.llvm.org/D34141

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320481&r1=320480&r2=320481&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 12 06:13:51 2017
@@ -33066,6 +33066,124 @@ static SDValue combineAndMaskToShift(SDN
   return DAG.getBitcast(N->getValueType(0), Shift);
 }
 
+// Get the index node from the lowered DAG of a GEP IR instruction with one
+// indexing dimension.
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+  if (Ld->isIndexed())
+    return SDValue();
+
+  SDValue Base = Ld->getBasePtr();
+
+  if (Base.getOpcode() != ISD::ADD)
+    return SDValue();
+
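+  // A one-dimensional GEP lowers to (add (shl idx, scale), base); the scaled
+  // index is expected here as the first ADD operand, so peel off the shift
+  // to recover idx.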
+  SDValue ShiftedIndex = Base.getOperand(0);
+
+  if (ShiftedIndex.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  return ShiftedIndex.getOperand(0);
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
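+    // BZHI operates on 32-bit operands; 64-bit operands require a 64-bit
+    // target.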
+    switch (VT.getSizeInBits()) {
+    default: return false;
+    case 64: return Subtarget.is64Bit();
+    case 32: return true;
+    }
+  }
+  return false;
+}
+
+// This function recognizes cases where the X86 BZHI instruction can replace
+// an 'and-load' sequence: a load of an integer value from an array of
+// constants defined as
+//
+//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
+//
+// followed by a bitwise AND of the loaded value with another input. This is
+// equivalent to performing BZHI (zero high bits) on the input, using the
+// load's index as the bit count.
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = Node->getSimpleValueType(0);
+  SDLoc dl(Node);
+
+  // Check if the subtarget has a BZHI instruction for the node's type.
+  if (!hasBZHI(Subtarget, VT))
+    return SDValue();
+
+  // Try matching the pattern for both operands.
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue N = Node->getOperand(i);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+    // Bail out if the operand is not a load instruction.
+    if (!Ld)
+      return SDValue();
+
+    const Value *MemOp = Ld->getMemOperand()->getValue();
+
+    if (!MemOp)
+      return SDValue();
+
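+    // The load must come from a GEP into a constant global array with a
+    // known initializer, so its elements can be inspected below.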
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+          Constant *Init = GV->getInitializer();
+          Type *Ty = Init->getType();
+          if (!isa<ConstantDataArray>(Init) ||
+              !Ty->getArrayElementType()->isIntegerTy() ||
+              Ty->getArrayElementType()->getScalarSizeInBits() !=
+                  VT.getSizeInBits() ||
+              Ty->getArrayNumElements() >
+                  Ty->getArrayElementType()->getScalarSizeInBits())
+            continue;
+
+          // Check that the array's constant elements match the expected pattern.
+          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+          bool ConstantsMatch = true;
+          for (uint64_t j = 0; j < ArrayElementCount; j++) {
+            ConstantInt *Elem =
+                cast<ConstantInt>(Init->getAggregateElement(j));
+            if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+              ConstantsMatch = false;
+              break;
+            }
+          }
+          if (!ConstantsMatch)
+            continue;
+
+          // Do the transformation (for a 32-bit type):
+          // -> (and (load arr[idx]), inp)
+          // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
+          //    which will then be replaced with a single bzhi instruction.
+          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+
+          // Get the Node which indexes into the array.
+          SDValue Index = getIndexFromUnindexedLoad(Ld);
+          if (!Index)
+            return SDValue();
+          Index = DAG.getZExtOrTrunc(Index, dl, VT);
+
+          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+
+          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -33094,6 +33212,9 @@ static SDValue combineAnd(SDNode *N, Sel
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
     return ShiftRight;
 
+  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+    return R;
+
   // Attempt to recursively combine a bitmask AND with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
     SDValue Op(N, 0);
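
As a side note, here is a minimal standalone check (not part of this patch, just an illustration) of the identity the combine relies on: element j of a 32-bit table holds 2^j - 1, which matches the (srl 0xFFFFFFFF, (sub 32, j)) form built above. The check uses 64-bit shifts so the j == 0 case stays well defined:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned j = 0; j < 32; ++j) {
      // Table element j: 2^j - 1.
      uint32_t Elem = static_cast<uint32_t>((uint64_t{1} << j) - 1);
      // Shift form: all-ones right-shifted by (32 - j), done in 64 bits so
      // that j == 0 (a shift by 32) is not undefined behavior.
      uint32_t Srl = static_cast<uint32_t>(0xFFFFFFFFull >> (32 - j));
      assert(Elem == Srl);
    }
    return 0;
  }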

Modified: llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll?rev=320481&r1=320480&r2=320481&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll (original)
+++ llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll Tue Dec 12 06:13:51 2017
@@ -10,17 +10,14 @@
 define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movslq %esi, %rax
-; CHECK-NEXT:    andl fill_table32(,%rax,4), %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhil %esi, %edi, %eax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32:       # %bb.0: # %entry
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl fill_table32(,%eax,4), %eax
-; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
@@ -32,17 +29,14 @@ entry:
 define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
 ; CHECK-LABEL: f32_bzhi_partial:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movslq %esi, %rax
-; CHECK-NEXT:    andl fill_table32_partial(,%rax,4), %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhil %esi, %edi, %eax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32:       # %bb.0: # %entry
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl fill_table32_partial(,%eax,4), %eax
-; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    retl
 entry:
   %idxprom = sext i32 %y to i64
   %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
@@ -54,9 +48,8 @@ entry:
 define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andq fill_table64(,%rsi,8), %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f64_bzhi:
 ; CHECK32:       # %bb.0: # %entry
@@ -65,7 +58,7 @@ define i64 @f64_bzhi(i64 %x, i64 %y) loc
 ; CHECK32-NEXT:    movl fill_table64(,%eax,8), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    retl
 entry:
   %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8
@@ -76,9 +69,8 @@ entry:
 define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
 ; CHECK-LABEL: f64_bzhi_partial:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andq fill_table64_partial(,%rsi,8), %rdi
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    ret{{[l|q]}}
+; CHECK-NEXT:    bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT:    retq
 ;
 ; CHECK32-LABEL: f64_bzhi_partial:
 ; CHECK32:       # %bb.0: # %entry
@@ -87,7 +79,7 @@ define i64 @f64_bzhi_partial(i64 %x, i64
 ; CHECK32-NEXT:    movl fill_table64_partial(,%eax,8), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; CHECK32-NEXT:    ret{{[l|q]}}
+; CHECK32-NEXT:    retl
 entry:
   %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
   %0 = load i64, i64* %arrayidx, align 8



