[llvm] r211932 - [NVPTX] Add isel patterns for bit-field extract (bfe)
Matt Arsenault
arsenm2 at gmail.com
Fri Jun 27 12:01:24 PDT 2014
On Jun 27, 2014, at 11:35 AM, Justin Holewinski <jholewinski at nvidia.com> wrote:
> Author: jholewinski
> Date: Fri Jun 27 13:35:27 2014
> New Revision: 211932
>
> URL: http://llvm.org/viewvc/llvm-project?rev=211932&view=rev
> Log:
> [NVPTX] Add isel patterns for bit-field extract (bfe)
>
> Added:
> llvm/trunk/test/CodeGen/NVPTX/bfe.ll
> Modified:
> llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
> llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
> llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
>
Would it make sense to introduce a new target-independent ISD node for BFE? You’ve added this here, AArch64 has very similar code, and I’ve been meaning to add essentially the same thing to R600 for a while.
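For reference, the operation all of these backends end up pattern-matching is the same bit-field extract. Here is a minimal stand-alone C++ model of the semantics such a common node would have to pin down (plain C++, not LLVM API; PTX's clamping of out-of-range start/len operands is deliberately ignored):

  #include <cassert>
  #include <cstdint>

  // Unsigned bit-field extract: 'len' bits of 'val' starting at bit 'start'.
  // This mirrors bfe.u32 for in-range operands.
  static uint32_t bfe_u32(uint32_t val, unsigned start, unsigned len) {
    assert(start + len <= 32 && "only the in-range case is modeled");
    if (len == 0)
      return 0;
    uint32_t mask = (len == 32) ? ~0u : ((1u << len) - 1);
    return (val >> start) & mask;
  }

  // Signed variant: extract the same field, then sign-extend from its top bit.
  static int32_t bfe_s32(uint32_t val, unsigned start, unsigned len) {
    uint32_t field = bfe_u32(val, start, len);
    if (len != 0 && len != 32 && (field >> (len - 1)))
      field |= ~((1u << len) - 1); // replicate the field's sign bit upward
    return (int32_t)field;
  }

  int main() {
    assert(bfe_u32(0xABCD, 4, 4) == 0xC);    // same shape as the bfe0 test below
    assert(bfe_s32(0x000000F0, 4, 4) == -1); // field 0xF, sign-extended
    return 0;
  }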
> Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp?rev=211932&r1=211931&r2=211932&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp Fri Jun 27 13:35:27 2014
> @@ -253,6 +253,12 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode
> case NVPTXISD::Suld3DV4I32Trap:
> ResNode = SelectSurfaceIntrinsic(N);
> break;
> + case ISD::AND:
> + case ISD::SRA:
> + case ISD::SRL:
> + // Try to select BFE
> + ResNode = SelectBFE(N);
> + break;
> case ISD::ADDRSPACECAST:
> ResNode = SelectAddrSpaceCast(N);
> break;
> @@ -2959,6 +2965,214 @@ SDNode *NVPTXDAGToDAGISel::SelectSurface
> return Ret;
> }
>
> +/// SelectBFE - Look for instruction sequences that can be made more efficient
> +/// by using the 'bfe' (bit-field extract) PTX instruction.
> +SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
> + SDValue LHS = N->getOperand(0);
> + SDValue RHS = N->getOperand(1);
> + SDValue Len;
> + SDValue Start;
> + SDValue Val;
> + bool IsSigned = false;
> +
> + if (N->getOpcode() == ISD::AND) {
> + // Canonicalize the operands
> + // We want 'and %val, %mask'
> + if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
> + std::swap(LHS, RHS);
> + }
> +
> + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
> + if (!Mask) {
> + // We need a constant mask on the RHS of the AND
> + return NULL;
> + }
> +
> + // Extract the mask bits
> + uint64_t MaskVal = Mask->getZExtValue();
> + if (!isMask_64(MaskVal)) {
> + // We *could* handle shifted masks here, but doing so would require an
> + // 'and' operation to fix up the low-order bits, so we would be trading
> + // shr+and for bfe+and, which has the same throughput
> + return NULL;
> + }
> +
> + // How many bits are in our mask?
> + uint64_t NumBits = CountTrailingOnes_64(MaskVal);
> + Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
> +
> + if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
> + // We have an 'srl/and' (or 'sra/and') pair; extract start bit and length
> + Val = LHS.getNode()->getOperand(0);
> + Start = LHS.getNode()->getOperand(1);
> + ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
> + if (StartConst) {
> + uint64_t StartVal = StartConst->getZExtValue();
> + // How many "good" bits do we have left? "good" is defined here as bits
> + // that exist in the original value, not shifted in.
> + uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
> + if (NumBits > GoodBits) {
> + // Do not handle the case where bits have been shifted in. In theory
> + // we could handle this, but the cost is likely higher than just
> + // emitting the srl/and pair.
> + return NULL;
> + }
> + Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
> + } else {
> + // Do not handle the case where the shift amount is not a constant. We
> + // could handle this case, but it would require run-time logic that
> + // would be more expensive than just emitting the srl/and pair.
> + return NULL;
> + }
> + } else {
> + // Do not handle the case where the LHS of the 'and' is not a shift.
> + // While it would be trivial to handle, doing so would just transform
> + // 'and' -> 'bfe', and a plain 'and' has higher throughput.
> + return NULL;
> + }
> + } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
> + if (LHS->getOpcode() == ISD::AND) {
> + ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
> + if (!ShiftCnst) {
> + // Shift amount must be constant
> + return NULL;
> + }
> +
> + uint64_t ShiftAmt = ShiftCnst->getZExtValue();
> +
> + SDValue AndLHS = LHS->getOperand(0);
> + SDValue AndRHS = LHS->getOperand(1);
> +
> + // Canonicalize the AND to have the mask on the RHS
> + if (isa<ConstantSDNode>(AndLHS)) {
> + std::swap(AndLHS, AndRHS);
> + }
> +
> + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
> + if (!MaskCnst) {
> + // Mask must be constant
> + return NULL;
> + }
> +
> + uint64_t MaskVal = MaskCnst->getZExtValue();
> + uint64_t NumZeros;
> + uint64_t NumBits;
> + if (isMask_64(MaskVal)) {
> + NumZeros = 0;
> + // The number of bits in the result bitfield will be the number of
> + // trailing ones (the AND) minus the number of bits we shift off
> + NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
> + } else if (isShiftedMask_64(MaskVal)) {
> + NumZeros = countTrailingZeros(MaskVal);
> + unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
> + // The number of bits in the result bitfield will be the number of
> + // trailing zeros plus the number of set bits in the mask minus the
> + // number of bits we shift off
> + NumBits = NumZeros + NumOnes - ShiftAmt;
> + } else {
> + // This is not a mask we can handle
> + return NULL;
> + }
> +
> + if (ShiftAmt < NumZeros) {
> + // Handling this case would require extra logic that would make the
> + // transformation unprofitable
> + return NULL;
> + }
> +
> + Val = AndLHS;
> + Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
> + Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
> + } else if (LHS->getOpcode() == ISD::SHL) {
> + // Here, we have a pattern like:
> + //
> + // (sra (shl val, NN), MM)
> + // or
> + // (srl (shl val, NN), MM)
> + //
> + // If MM >= NN, we can efficiently optimize this with bfe
> + Val = LHS->getOperand(0);
> +
> + SDValue ShlRHS = LHS->getOperand(1);
> + ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
> + if (!ShlCnst) {
> + // Shift amount must be constant
> + return NULL;
> + }
> + uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
> +
> + SDValue ShrRHS = RHS;
> + ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
> + if (!ShrCnst) {
> + // Shift amount must be constant
> + return NULL;
> + }
> + uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
> +
> + // To avoid extra codegen and be profitable, we need Outer >= Inner
> + if (OuterShiftAmt < InnerShiftAmt) {
> + return NULL;
> + }
> +
> + // If the outer shift is wider than the type, we have no bitfield to
> + // extract (and since we also check that the inner shift is <= the outer
> + // shift, this also implies that the inner shift is < the type size)
> + if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
> + return NULL;
> + }
> +
> + Start =
> + CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
> + Len =
> + CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
> + OuterShiftAmt, MVT::i32);
> +
> + if (N->getOpcode() == ISD::SRA) {
> + // If we have an arithmetic right shift, we need to use the signed bfe
> + // variant
> + IsSigned = true;
> + }
> + } else {
> + // No can do...
> + return NULL;
> + }
> + } else {
> + // No can do...
> + return NULL;
> + }
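As a sanity check on the Start/Len arithmetic in the shifted-mask branch above, here is a throwaway stand-alone verification (plain C++, not part of the patch; the trailing-count helpers are local stand-ins for the MathExtras routines the patch uses):

  #include <cassert>
  #include <cstdint>

  static unsigned trailingZeros(uint64_t v) {
    unsigned n = 0;
    for (; v && !(v & 1); v >>= 1)
      ++n;
    return n;
  }
  static unsigned trailingOnes(uint64_t v) {
    unsigned n = 0;
    for (; v & 1; v >>= 1)
      ++n;
    return n;
  }

  int main() {
    // (srl (and val, 0x70), 5): the mask covers bits 4..6, then 5 bits are
    // shifted off, so only original bits 5..6 survive.
    const uint64_t MaskVal = 0x70, ShiftAmt = 5;
    unsigned NumZeros = trailingZeros(MaskVal);           // 4
    unsigned NumOnes = trailingOnes(MaskVal >> NumZeros); // 3
    uint64_t NumBits = NumZeros + NumOnes - ShiftAmt;     // 2
    assert(ShiftAmt >= NumZeros); // the profitability guard in the patch

    // bfe with Start = ShiftAmt, Len = NumBits matches the original sequence.
    for (uint32_t val = 0; val < 0x1000; ++val) {
      uint32_t srlAnd = (uint32_t)(val & MaskVal) >> ShiftAmt;
      uint32_t bfe = (val >> ShiftAmt) & ((1u << NumBits) - 1);
      assert(srlAnd == bfe);
    }
    return 0;
  }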
> +
> +
> + unsigned Opc;
> + // For the BFE operations we form here from "and" and "srl", always use the
> + // unsigned variants.
> + if (Val.getValueType() == MVT::i32) {
> + if (IsSigned) {
> + Opc = NVPTX::BFE_S32rii;
> + } else {
> + Opc = NVPTX::BFE_U32rii;
> + }
> + } else if (Val.getValueType() == MVT::i64) {
> + if (IsSigned) {
> + Opc = NVPTX::BFE_S64rii;
> + } else {
> + Opc = NVPTX::BFE_U64rii;
> + }
> + } else {
> + // We cannot handle this type
> + return NULL;
> + }
> +
> + SDValue Ops[] = {
> + Val, Start, Len
> + };
> +
> + SDNode *Ret =
> + CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
> +
> + return Ret;
> +}
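And a similar throwaway check for the (sra (shl val, Inner), Outer) path, which also shows why that case needs the signed bfe variant (again plain C++, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // For (sra (shl val, Inner), Outer) with Inner <= Outer < 32, the sequence
  // sign-extends a (32 - Outer)-bit field of val starting at bit
  // (Outer - Inner), i.e. bfe.s32 with Start = Outer - Inner, Len = 32 - Outer.
  static int32_t shlSra(int32_t val, unsigned Inner, unsigned Outer) {
    return (int32_t)((uint32_t)val << Inner) >> Outer;
  }

  static int32_t bfeS32(uint32_t val, unsigned start, unsigned len) {
    uint32_t field = (val >> start) & ((1u << len) - 1);
    if (field >> (len - 1))
      field |= ~((1u << len) - 1); // sign-extend the extracted field
    return (int32_t)field;
  }

  int main() {
    const unsigned Inner = 8, Outer = 12;
    const unsigned Start = Outer - Inner; // 4
    const unsigned Len = 32 - Outer;      // 20
    for (uint32_t val = 0; val < 0x20000; val += 7)
      assert(shlSra((int32_t)val, Inner, Outer) == bfeS32(val, Start, Len));
    assert(shlSra(-12345, Inner, Outer) ==
           bfeS32((uint32_t)-12345, Start, Len));
    return 0;
  }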
> +
> // SelectDirectAddr - Match a direct address for DAG.
> // A direct address could be a globaladdress or externalsymbol.
> bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
>
> Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h?rev=211932&r1=211931&r2=211932&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h (original)
> +++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h Fri Jun 27 13:35:27 2014
> @@ -71,6 +71,7 @@ private:
> SDNode *SelectAddrSpaceCast(SDNode *N);
> SDNode *SelectTextureIntrinsic(SDNode *N);
> SDNode *SelectSurfaceIntrinsic(SDNode *N);
> + SDNode *SelectBFE(SDNode *N);
>
> inline SDValue getI32Imm(unsigned Imm) {
> return CurDAG->getTargetConstant(Imm, MVT::i32);
>
> Modified: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td?rev=211932&r1=211931&r2=211932&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td (original)
> +++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td Fri Jun 27 13:35:27 2014
> @@ -1179,6 +1179,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64
> !strconcat("}}", ""))))))))),
> [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
>
> +// BFE - bit-field extract
> +
> +multiclass BFE<string TyStr, RegisterClass RC> {
> + // BFE supports both 32-bit and 64-bit values, but the start and length
> + // operands are always 32-bit
> + def rrr
> + : NVPTXInst<(outs RC:$d),
> + (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
> + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
> + def rri
> + : NVPTXInst<(outs RC:$d),
> + (ins RC:$a, Int32Regs:$b, i32imm:$c),
> + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
> + def rii
> + : NVPTXInst<(outs RC:$d),
> + (ins RC:$a, i32imm:$b, i32imm:$c),
> + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
> +}
> +
> +defm BFE_S32 : BFE<"s32", Int32Regs>;
> +defm BFE_U32 : BFE<"u32", Int32Regs>;
> +defm BFE_S64 : BFE<"s64", Int64Regs>;
> +defm BFE_U64 : BFE<"u64", Int64Regs>;
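One note for readers of the .td change: the rrr/rri/rii suffixes spell out each source operand as register (r) or immediate (i), following the usual backend naming convention, so each defm above expands to three records. For the unsigned 32-bit case:

  BFE_U32rrr   // start and length both in registers
  BFE_U32rri   // start in a register, length an immediate
  BFE_U32rii   // start and length both immediates

Since SelectBFE always materializes Start and Len as target constants, only the rii forms are referenced by the selection code.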
>
> //-----------------------------------
> // General Comparison
>
> Added: llvm/trunk/test/CodeGen/NVPTX/bfe.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/bfe.ll?rev=211932&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/NVPTX/bfe.ll (added)
> +++ llvm/trunk/test/CodeGen/NVPTX/bfe.ll Fri Jun 27 13:35:27 2014
> @@ -0,0 +1,32 @@
> +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
> +
> +
> +; CHECK: bfe0
> +define i32 @bfe0(i32 %a) {
> +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4
> +; CHECK-NOT: shr
> +; CHECK-NOT: and
> + %val0 = ashr i32 %a, 4
> + %val1 = and i32 %val0, 15
> + ret i32 %val1
> +}
> +
> +; CHECK: bfe1
> +define i32 @bfe1(i32 %a) {
> +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 3, 3
> +; CHECK-NOT: shr
> +; CHECK-NOT: and
> + %val0 = ashr i32 %a, 3
> + %val1 = and i32 %val0, 7
> + ret i32 %val1
> +}
> +
> +; CHECK: bfe2
> +define i32 @bfe2(i32 %a) {
> +; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 3
> +; CHECK-NOT: shr
> +; CHECK-NOT: and
> + %val0 = ashr i32 %a, 5
> + %val1 = and i32 %val0, 7
> + ret i32 %val1
> +}
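To tie the expected operands back to the IR: ashr by N followed by and with 2^M - 1 becomes bfe with start N and length M, and the unsigned variant remains correct even for negative inputs because the mask discards every sign bit the ashr shifts in. A throwaway stand-alone check of all three cases (plain C++, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // (ashr x, shift) & ((1 << bits) - 1) vs. an unsigned bit-field extract
  // with Start = shift, Len = bits. The mask kills the bits the arithmetic
  // shift sign-extends in, so bfe.u32 is the right variant despite the ashr.
  static uint32_t ashrAnd(int32_t x, unsigned shift, unsigned bits) {
    return (uint32_t)(x >> shift) & ((1u << bits) - 1);
  }
  static uint32_t bfeU32(uint32_t x, unsigned start, unsigned len) {
    return (x >> start) & ((1u << len) - 1);
  }

  int main() {
    const int32_t samples[] = {0, 1, 0xABCD, -1, -12345, INT32_MIN, INT32_MAX};
    for (int32_t x : samples) {
      assert(ashrAnd(x, 4, 4) == bfeU32((uint32_t)x, 4, 4)); // bfe0
      assert(ashrAnd(x, 3, 3) == bfeU32((uint32_t)x, 3, 3)); // bfe1
      assert(ashrAnd(x, 5, 3) == bfeU32((uint32_t)x, 5, 3)); // bfe2
    }
    return 0;
  }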
>
>