[llvm] r320481 - [X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 13 18:44:31 PST 2017
Out of curiosity, why do this in the backend? Replacing the load from
such a constant array with the equivalent shift-and-subtract sequence
in IR would seem like a generally useful optimization. Trading a load
for a shift and subtract is clearly profitable. Once that's visible in
the IR, we could pattern match the BZHI pretty straightforwardly.
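Concretely, the IR-level rewrite could look something like the sketch
below (rewriteTableLoad is a hypothetical helper, and Idx is assumed to
already be cast to the load's type; a real transform would also have to
guard idx == 0, since shifting by the full bit width is poison in IR,
whereas BZHI handles that case natively):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch: given a load of table[idx], where table[i] == 2^i - 1,
    // build the equivalent mask computation (-1 >> (BitWidth - idx)).
    static Value *rewriteTableLoad(LoadInst *LI, Value *Idx,
                                   IRBuilder<> &B) {
      Type *Ty = LI->getType();                // e.g. i32
      unsigned BW = Ty->getIntegerBitWidth();  // e.g. 32
      Value *Amt = B.CreateSub(ConstantInt::get(Ty, BW), Idx);
      return B.CreateLShr(Constant::getAllOnesValue(Ty), Amt);
    }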
Philip
On 12/12/2017 06:13 AM, Ayman Musa via llvm-commits wrote:
> Author: aymanmus
> Date: Tue Dec 12 06:13:51 2017
> New Revision: 320481
>
> URL: http://llvm.org/viewvc/llvm-project?rev=320481&view=rev
> Log:
> [X86] Recognize constant arrays with special values and replace loads from them with subtract and shift instructions, which will then be replaced by the X86 BZHI machine instruction.
>
> Recognize constant arrays with the following values:
> 0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, ..., 2^(size-1) - 1
> where //size// is the size of the array.
>
> The result of a load with index //idx// from this array is equivalent to:
> (0xFFFFFFFF >> (32 - idx)) (assuming an array of 32-bit integers).
>
> The result of an 'and' between such a loaded value and another input is
> exactly the behavior of the X86 BZHI instruction.
>
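> For illustration, the equivalence is easy to check in plain C++ (a
> sketch; maskFor is a hypothetical helper, and the 64-bit shift keeps
> idx == 0 well defined):
>
>     #include <cstdint>
>
>     // table[idx] == 2^idx - 1, i.e. the low idx bits set.
>     static uint32_t maskFor(uint32_t idx) {
>       return (uint32_t)(0xFFFFFFFFull >> (32 - idx));
>     }
>     // For any x: (x & table[idx]) == (x & maskFor(idx)), which is
>     // exactly the behavior of bzhi(x, idx).
>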
> See the LIT test for concrete examples.
>
> Differential Revision: https://reviews.llvm.org/D34141
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=320481&r1=320480&r2=320481&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 12 06:13:51 2017
> @@ -33066,6 +33066,124 @@ static SDValue combineAndMaskToShift(SDN
> return DAG.getBitcast(N->getValueType(0), Shift);
> }
>
> +// Get the index node from the lowered DAG of a GEP IR instruction with one
> +// indexing dimension.
> +static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
> + if (Ld->isIndexed())
> + return SDValue();
> +
> + SDValue Base = Ld->getBasePtr();
> +
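> + // A GEP with a single index lowers to (add (shl Index, log2(EltSize)), Base);
> + // expect the scaled-index term as operand 0 of the ADD.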
> + if (Base.getOpcode() != ISD::ADD)
> + return SDValue();
> +
> + SDValue ShiftedIndex = Base.getOperand(0);
> +
> + if (ShiftedIndex.getOpcode() != ISD::SHL)
> + return SDValue();
> +
> + return ShiftedIndex.getOperand(0);
> +}
> +
> +static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
> + if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
> + switch (VT.getSizeInBits()) {
> + default: return false;
> + case 64: return Subtarget.is64Bit();
> + case 32: return true;
> + }
> + }
> + return false;
> +}
> +
> +// This function recognizes cases where the X86 BZHI instruction can replace
> +// an 'and-load' sequence.
> +// When an integer value is loaded from an array of constants defined as:
> +//
> +// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
> +//
> +// and then combined with another input via a bitwise 'and', the result is
> +// equivalent to performing bzhi (zero high bits) on the input, with the
> +// same index as the load.
> +static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
> + const X86Subtarget &Subtarget) {
> + MVT VT = Node->getSimpleValueType(0);
> + SDLoc dl(Node);
> +
> + // Check if subtarget has BZHI instruction for the node's type
> + if (!hasBZHI(Subtarget, VT))
> + return SDValue();
> +
> + // Try matching the pattern for both operands.
> + for (unsigned i = 0; i < 2; i++) {
> + SDValue N = Node->getOperand(i);
> + LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
> +
> + // Skip this operand if it is not a load instruction.
> + if (!Ld)
> + continue;
> +
> + const Value *MemOp = Ld->getMemOperand()->getValue();
> +
> + if (!MemOp)
> + continue;
> +
> + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
> + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
> + if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
> +
> + Constant *Init = GV->getInitializer();
> + Type *Ty = Init->getType();
> + if (!isa<ConstantDataArray>(Init) ||
> + !Ty->getArrayElementType()->isIntegerTy() ||
> + Ty->getArrayElementType()->getScalarSizeInBits() !=
> + VT.getSizeInBits() ||
> + Ty->getArrayNumElements() >
> + Ty->getArrayElementType()->getScalarSizeInBits())
> + continue;
> +
> + // Check if the array's constant elements are suitable for our case.
> + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
> + bool ConstantsMatch = true;
> + for (uint64_t j = 0; j < ArrayElementCount; j++) {
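> + // Elements of an integer ConstantDataArray are always ConstantInts,
> + // so the dyn_cast below cannot return null.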
> + ConstantInt *Elem =
> + dyn_cast<ConstantInt>(Init->getAggregateElement(j));
> + if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
> + ConstantsMatch = false;
> + break;
> + }
> + }
> + if (!ConstantsMatch)
> + continue;
> +
> + // Do the transformation (for a 32-bit type):
> + // -> (and (load arr[idx]), inp)
> + // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
> + // which will then be matched to a single bzhi instruction.
> + SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
> + SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
> +
> + // Get the Node which indexes into the array.
> + SDValue Index = getIndexFromUnindexedLoad(Ld);
> + if (!Index)
> + return SDValue();
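> + // The index node may be narrower or wider than VT; normalize it first.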
> + Index = DAG.getZExtOrTrunc(Index, dl, VT);
> +
> + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
> +
> + SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
> + SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
> +
> + return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
> + }
> + }
> + }
> + }
> + return SDValue();
> +}
> +
> static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget &Subtarget) {
> @@ -33094,6 +33212,9 @@ static SDValue combineAnd(SDNode *N, Sel
> if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
> return ShiftRight;
>
> + if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
> + return R;
> +
> // Attempt to recursively combine a bitmask AND with shuffles.
> if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
> SDValue Op(N, 0);
>
> Modified: llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll?rev=320481&r1=320480&r2=320481&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/replace-load-and-with-bzhi.ll Tue Dec 12 06:13:51 2017
> @@ -10,17 +10,14 @@
> define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
> ; CHECK-LABEL: f32_bzhi:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: movslq %esi, %rax
> -; CHECK-NEXT: andl fill_table32(,%rax,4), %edi
> -; CHECK-NEXT: movl %edi, %eax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhil %esi, %edi, %eax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f32_bzhi:
> ; CHECK32: # %bb.0: # %entry
> ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: movl fill_table32(,%eax,4), %eax
> -; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
> +; CHECK32-NEXT: retl
> entry:
> %idxprom = sext i32 %y to i64
> %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
> @@ -32,17 +29,14 @@ entry:
> define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
> ; CHECK-LABEL: f32_bzhi_partial:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: movslq %esi, %rax
> -; CHECK-NEXT: andl fill_table32_partial(,%rax,4), %edi
> -; CHECK-NEXT: movl %edi, %eax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhil %esi, %edi, %eax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f32_bzhi_partial:
> ; CHECK32: # %bb.0: # %entry
> ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: movl fill_table32_partial(,%eax,4), %eax
> -; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
> +; CHECK32-NEXT: retl
> entry:
> %idxprom = sext i32 %y to i64
> %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
> @@ -54,9 +48,8 @@ entry:
> define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
> ; CHECK-LABEL: f64_bzhi:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: andq fill_table64(,%rsi,8), %rdi
> -; CHECK-NEXT: movq %rdi, %rax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f64_bzhi:
> ; CHECK32: # %bb.0: # %entry
> @@ -65,7 +58,7 @@ define i64 @f64_bzhi(i64 %x, i64 %y) loc
> ; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: retl
> entry:
> %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
> %0 = load i64, i64* %arrayidx, align 8
> @@ -76,9 +69,8 @@ entry:
> define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
> ; CHECK-LABEL: f64_bzhi_partial:
> ; CHECK: # %bb.0: # %entry
> -; CHECK-NEXT: andq fill_table64_partial(,%rsi,8), %rdi
> -; CHECK-NEXT: movq %rdi, %rax
> -; CHECK-NEXT: ret{{[l|q]}}
> +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
> +; CHECK-NEXT: retq
> ;
> ; CHECK32-LABEL: f64_bzhi_partial:
> ; CHECK32: # %bb.0: # %entry
> @@ -87,7 +79,7 @@ define i64 @f64_bzhi_partial(i64 %x, i64
> ; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
> ; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
> -; CHECK32-NEXT: ret{{[l|q]}}
> +; CHECK32-NEXT: retl
> entry:
> %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
> %0 = load i64, i64* %arrayidx, align 8
>
>