[llvm] r206774 - ARM64: Combine shifts and uses from different basic block to bit-extract instruction

Mon Apr 21 15:06:56 PDT 2014

This change seems to have broken the build of compiler-rt:

http://llvm.org/bugs/show_bug.cgi?id=19503

On Mon, Apr 21, 2014 at 12:34 PM, Yi Jiang <yjiang at apple.com> wrote:

> Author: yjiang
> Date: Mon Apr 21 14:34:27 2014
> New Revision: 206774
>
> URL: http://llvm.org/viewvc/llvm-project?rev=206774&view=rev
> Log:
> ARM64: Combine shifts and uses from different basic block to bit-extract
> instruction
>
> Modified:
>     llvm/trunk/include/llvm/Target/TargetLowering.h
>     llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
>     llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
>     llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp
>     llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll
>
> Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetLowering.h Mon Apr 21 14:34:27
> 2014
> @@ -182,6 +182,9 @@ public:
>      return HasMultipleConditionRegisters;
>    }
>
> +  /// Return true if the target has BitExtract instructions.
> +  bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
> +
>    /// Return true if a vector of the given type should be split
>    /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during
> type
>    /// legalization.
> @@ -1010,6 +1013,14 @@ protected:
>      HasMultipleConditionRegisters = hasManyRegs;
>    }
>
> +  /// Tells the code generator that the target has BitExtract
> instructions.
> +  /// The code generator will aggressively sink "shift"s into the blocks
> of
> +  /// their users if the users will generate "and" instructions which can
> be
> +  /// combined with "shift" to BitExtract instructions.
> +  void setHasExtractBitsInsn(bool hasExtractInsn = true) {
> +    HasExtractBitsInsn = hasExtractInsn;
> +  }
> +
>    /// Tells the code generator not to expand sequence of operations into a
>    /// separate sequences that increases the amount of flow control.
>    void setJumpIsExpensive(bool isExpensive = true) {
> @@ -1436,6 +1447,12 @@ private:
>    /// the blocks of their users.
>    bool HasMultipleConditionRegisters;
>
> +  /// Tells the code generator that the target has BitExtract
> instructions.
> +  /// The code generator will aggressively sink "shift"s into the blocks
> of
> +  /// their users if the users will generate "and" instructions which can
> be
> +  /// combined with "shift" to BitExtract instructions.
> +  bool HasExtractBitsInsn;
> +
>    /// Tells the code generator not to expand integer divides by constants
> into a
>    /// sequence of muls, adds, and shifts.  This is a hack until a real
> cost
>    /// model is in place.  If we ever optimize for size, this will be set
> to true
>
> Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
> +++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Mon Apr 21 14:34:27 2014
> @@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpIns
>    return MadeChange;
>  }
>
> +/// isExtractBitsCandidateUse - Check if the candidates could
> +/// be combined with shift instruction, which includes:
> +/// 1. Truncate instruction
> +/// 2. And instruction and the imm is a mask of the low bits:
> +/// imm & (imm+1) == 0
> +bool isExtractBitsCandidateUse(Instruction *User) {
> +  if (!isa<TruncInst>(User)) {
> +    if (User->getOpcode() != Instruction::And ||
> +        !isa<ConstantInt>(User->getOperand(1)))
> +      return false;
> +
> +    unsigned Cimm =
> dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
> +
> +    if (Cimm & (Cimm + 1))
> +      return false;
> +  }
> +  return true;
> +}
> +
> +/// SinkShiftAndTruncate - sink both shift and truncate instruction
> +/// to the use of truncate's BB.
> +bool
> +SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User,
> ConstantInt *CI,
> +                     DenseMap<BasicBlock *, BinaryOperator *>
> &InsertedShifts,
> +                     const TargetLowering &TLI) {
> +  BasicBlock *UserBB = User->getParent();
> +  DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
> +  TruncInst *TruncI = dyn_cast<TruncInst>(User);
> +  bool MadeChange = false;
> +
> +  for (Value::user_iterator TruncUI = TruncI->user_begin(),
> +                            TruncE = TruncI->user_end();
> +       TruncUI != TruncE;) {
> +
> +    Use &TruncTheUse = TruncUI.getUse();
> +    Instruction *TruncUser = cast<Instruction>(*TruncUI);
> +    // Preincrement use iterator so we don't invalidate it.
> +
> +    ++TruncUI;
> +
> +    int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
> +    if (!ISDOpcode)
> +      continue;
> +
> +    // If the use is actually a legal node, there will not be an implicit
> +    // truncate.
> +    if (TLI.isOperationLegalOrCustom(ISDOpcode,
> +                                     EVT::getEVT(TruncUser->getType())))
> +      continue;
> +
> +    // Don't bother for PHI nodes.
> +    if (isa<PHINode>(TruncUser))
> +      continue;
> +
> +    BasicBlock *TruncUserBB = TruncUser->getParent();
> +
> +    if (UserBB == TruncUserBB)
> +      continue;
> +
> +    BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
> +    CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
> +
> +    if (!InsertedShift && !InsertedTrunc) {
> +      BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
> +      // Sink the shift
> +      if (ShiftI->getOpcode() == Instruction::AShr)
> +        InsertedShift =
> +            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +      else
> +        InsertedShift =
> +            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +
> +      // Sink the trunc
> +      BasicBlock::iterator TruncInsertPt =
> TruncUserBB->getFirstInsertionPt();
> +      TruncInsertPt++;
> +
> +      InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
> +                                       TruncI->getType(), "",
> TruncInsertPt);
> +
> +      MadeChange = true;
> +
> +      TruncTheUse = InsertedTrunc;
> +    }
> +  }
> +  return MadeChange;
> +}
> +
> +/// OptimizeExtractBits - sink the shift *right* instruction into user
> blocks if
> +/// the uses could potentially be combined with this shift instruction and
> +/// generate BitExtract instruction. It will only be applied if the
> architecture
> +/// supports BitExtract instruction. Here is an example:
> +/// BB1:
> +///   %x.extract.shift = lshr i64 %arg1, 32
> +/// BB2:
> +///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
> +/// ==>
> +///
> +/// BB2:
> +///   %x.extract.shift.1 = lshr i64 %arg1, 32
> +///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
> +///
> +/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
> +/// instruction.
> +/// Return true if any changes are made.
> +static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
> +                                const TargetLowering &TLI) {
> +  BasicBlock *DefBB = ShiftI->getParent();
> +
> +  /// Only insert instructions in each block once.
> +  DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
> +
> +  bool shiftIsLegal =
> TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
> +
> +  bool MadeChange = false;
> +  for (Value::user_iterator UI = ShiftI->user_begin(), E =
> ShiftI->user_end();
> +       UI != E;) {
> +    Use &TheUse = UI.getUse();
> +    Instruction *User = cast<Instruction>(*UI);
> +    // Preincrement use iterator so we don't invalidate it.
> +    ++UI;
> +
> +    // Don't bother for PHI nodes.
> +    if (isa<PHINode>(User))
> +      continue;
> +
> +    if (!isExtractBitsCandidateUse(User))
> +      continue;
> +
> +    BasicBlock *UserBB = User->getParent();
> +
> +    if (UserBB == DefBB) {
> +      // If the shift and truncate instruction are in the same BB. The
> use of
> +      // the truncate(TruncUse) may still introduce another truncate if
> not
> +      // legal. In this case, we would like to sink both shift and
> truncate
> +      // instruction to the BB of TruncUse.
> +      // for example:
> +      // BB1:
> +      // i64 shift.result = lshr i64 opnd, imm
> +      // trunc.result = trunc shift.result to i16
> +      //
> +      // BB2:
> +      //   ----> We will have an implicit truncate here if the
> architecture does
> +      //   not have i16 compare.
> +      // cmp i16 trunc.result, opnd2
> +      //
> +      if (isa<TruncInst>(User) && shiftIsLegal
> +          // If the type of the truncate is legal, no trucate will be
> +          // introduced in other basic blocks.
> +          && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
> +        MadeChange =
> +            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
> +
> +      continue;
> +    }
> +    // If we have already inserted a shift into this block, use it.
> +    BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
> +
> +    if (!InsertedShift) {
> +      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
> +
> +      if (ShiftI->getOpcode() == Instruction::AShr)
> +        InsertedShift =
> +            BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +      else
> +        InsertedShift =
> +            BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +
> +      MadeChange = true;
> +    }
> +
> +    // Replace a use of the shift with a use of the new shift.
> +    TheUse = InsertedShift;
> +  }
> +
> +  // If we removed all uses, nuke the shift.
> +  if (ShiftI->use_empty())
> +    ShiftI->eraseFromParent();
> +
> +  return MadeChange;
> +}
> +
>  namespace {
>  class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
>  protected:
> @@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instru
>      return false;
>    }
>
> +  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
> +
> +  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
> +                BinOp->getOpcode() == Instruction::LShr)) {
> +    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
> +    if (TLI && CI && TLI->hasExtractBitsInsn())
> +      return OptimizeExtractBits(BinOp, CI, *TLI);
> +
> +    return false;
> +  }
> +
>    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
>      if (GEPI->hasAllZeroIndices()) {
>        /// The GEP operand must be a pointer, so must its result -> BitCast
>
> Modified: llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp Mon Apr 21 14:34:27
> 2014
> @@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(S
>      // Make sure to clamp the MSB so that we preserve the semantics of the
>      // original operations.
>      ClampMSB = true;
> +  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
> +             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
> +                                   Srl_imm)) {
> +    // If the shift result was truncated, we can still combine them.
> +    Opd0 = Op0->getOperand(0).getOperand(0);
> +
> +    // Use the type of SRL node.
> +    VT = Opd0->getValueType(0);
>    } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
>      Opd0 = Op0->getOperand(0);
>    } else if (BiggerPattern) {
> @@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(S
>
>    // we're looking for a shift of a shift
>    uint64_t Shl_imm = 0;
> +  uint64_t Trunc_bits = 0;
>    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL,
> Shl_imm)) {
>      Opd0 = N->getOperand(0).getOperand(0);
> +  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
> +             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
> +    // We are looking for a shift of truncate. Truncate from i64 to i32
> could
> +    // be considered as setting high 32 bits as zero. Our strategy here
> is to
> +    // always generate 64bit UBFM. This consistency will help the CSE pass
> +    // later find more redundancy.
> +    Opd0 = N->getOperand(0).getOperand(0);
> +    Trunc_bits = Opd0->getValueType(0).getSizeInBits() -
> VT.getSizeInBits();
> +    VT = Opd0->getValueType(0);
> +    assert(VT == MVT::i64 && "the promoted type should be i64");
>    } else if (BiggerPattern) {
>      // Let's pretend a 0 shift left has been performed.
>      // FIXME: Currently we limit this to the bigger pattern case,
> @@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(S
>    assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
>           "bad amount in shift node!");
>    // Note: The width operand is encoded as width-1.
> -  unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
> +  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
>    int sLSB = Srl_imm - Shl_imm;
>    if (sLSB < 0)
>      return false;
> @@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfiel
>      return NULL;
>
>    EVT VT = N->getValueType(0);
> -  SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
> -                    CurDAG->getTargetConstant(MSB, VT) };
> +
> +  // If the bit extract operation is 64bit but the original type is
> 32bit, we
> +  // need to add one EXTRACT_SUBREG.
> +  if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32)
> {
> +    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
> +                       CurDAG->getTargetConstant(MSB, MVT::i64)};
> +
> +    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
> +    SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
> +    MachineSDNode *Node =
> +        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
> MVT::i32,
> +                               SDValue(BFM, 0), SubReg);
> +    return Node;
> +  }
> +
> +  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
> +                   CurDAG->getTargetConstant(MSB, VT)};
>    return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
>  }
>
>
> Modified: llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp Mon Apr 21 14:34:27
> 2014
> @@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering
>    setDivIsWellDefined(true);
>
>    RequireStrictAlign = StrictAlign;
> +
> +  setHasExtractBitsInsn(true);
>  }
>
>  void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
>
> Modified: llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll Mon Apr 21 14:34:27
> 2014
> @@ -1,3 +1,4 @@
> +; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s |
> FileCheck --check-prefix=OPT %s
>  ; RUN: llc < %s -march=arm64 | FileCheck %s
>  %struct.X = type { i8, i8, [2 x i8] }
>  %struct.Y = type { i32, i8 }
> @@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind s
>    %result = and i64 %conv82, 255
>    ret i64 %result
>  }
> +
> +; Using the access to the global array to keep the instruction and
> control flow.
> + at first_ones = external global [65536 x i8]
> +
> +; Function Attrs: nounwind readonly ssp
> +define i32 @fct19(i64 %arg1) nounwind readonly ssp  {
> +; CHECK-LABEL: fct19:
> +entry:
> +  %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
> +  %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
> +  %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
> +  %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
> +  %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
> +  br i1 %tobool, label %if.end, label %if.then
> +
> +if.then:                                          ; preds = %entry
> +  %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %x.sroa.5.0.extract.shift
> +  %0 = load i8* %arrayidx3, align 1
> +  %conv = zext i8 %0 to i32
> +  br label %return
> +
> +; OPT-LABEL: if.end
> +if.end:                                           ; preds = %entry
> +; OPT: lshr
> +; CHECK: ubfm  [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
> +  %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
> +  %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
> +; CHECK: cbz
> +  br i1 %tobool6, label %if.end13, label %if.then7
> +
> +; OPT-LABEL: if.then7
> +if.then7:                                         ; preds = %if.end
> +; OPT: lshr
> +; "and" should be combined to "ubfm" while "ubfm" should be removed by
> cse.
> +; So neither of them should be in the assemble code.
> +; CHECK-NOT: and
> +; CHECK-NOT: ubfm
> +  %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
> +  %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %idxprom10
> +  %1 = load i8* %arrayidx11, align 1
> +  %conv12 = zext i8 %1 to i32
> +  %add = add nsw i32 %conv12, 16
> +  br label %return
> +
> +; OPT-LABEL: if.end13
> +if.end13:                                         ; preds = %if.end
> +; OPT: lshr
> +; OPT: trunc
> +; CHECK: ubfm  [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
> +  %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
> +; CHECK: cbz
> +  br i1 %tobool16, label %return, label %if.then17
> +
> +; OPT-LABEL: if.then17
> +if.then17:                                        ; preds = %if.end13
> +; OPT: lshr
> +; "and" should be combined to "ubfm" while "ubfm" should be removed by
> cse.
> +; So neither of them should be in the assemble code.
> +; CHECK-NOT: and
> +; CHECK-NOT: ubfm
> +  %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
> +  %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %idxprom20
> +  %2 = load i8* %arrayidx21, align 1
> +  %conv22 = zext i8 %2 to i32
> +  %add23 = add nsw i32 %conv22, 32
> +  br label %return
> +
> +return:                                           ; preds = %if.end13,
> %if.then17, %if.then7, %if.then
> +; CHECK: ret
> +  %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23,
> %if.then17 ], [ 64, %if.end13 ]
> +  ret i32 %retval.0
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140421/d9d4936a/attachment.html>