[llvm] r206774 - ARM64: Combine shifts and uses from different basic block to bit-extract instruction
Richard Smith
richard at metafoo.co.uk
Mon Apr 21 15:06:56 PDT 2014
This change seems to have broken the build of compiler-rt:
http://llvm.org/bugs/show_bug.cgi?id=19503
On Mon, Apr 21, 2014 at 12:34 PM, Yi Jiang <yjiang at apple.com> wrote:
> Author: yjiang
> Date: Mon Apr 21 14:34:27 2014
> New Revision: 206774
>
> URL: http://llvm.org/viewvc/llvm-project?rev=206774&view=rev
> Log:
> ARM64: Combine shifts and uses from different basic block to bit-extract
> instruction
>
> Modified:
> llvm/trunk/include/llvm/Target/TargetLowering.h
> llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
> llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
> llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp
> llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll
>
> Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetLowering.h Mon Apr 21 14:34:27
> 2014
> @@ -182,6 +182,9 @@ public:
> return HasMultipleConditionRegisters;
> }
>
> + /// Return true if the target has BitExtract instructions.
> + bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
> +
> /// Return true if a vector of the given type should be split
> /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during
> type
> /// legalization.
> @@ -1010,6 +1013,14 @@ protected:
> HasMultipleConditionRegisters = hasManyRegs;
> }
>
> + /// Tells the code generator that the target has BitExtract
> instructions.
> + /// The code generator will aggressively sink "shift"s into the blocks
> of
> + /// their users if the users will generate "and" instructions which can
> be
> + /// combined with "shift" to BitExtract instructions.
> + void setHasExtractBitsInsn(bool hasExtractInsn = true) {
> + HasExtractBitsInsn = hasExtractInsn;
> + }
> +
> /// Tells the code generator not to expand sequence of operations into a
> /// separate sequences that increases the amount of flow control.
> void setJumpIsExpensive(bool isExpensive = true) {
> @@ -1436,6 +1447,12 @@ private:
> /// the blocks of their users.
> bool HasMultipleConditionRegisters;
>
> + /// Tells the code generator that the target has BitExtract
> instructions.
> + /// The code generator will aggressively sink "shift"s into the blocks
> of
> + /// their users if the users will generate "and" instructions which can
> be
> + /// combined with "shift" to BitExtract instructions.
> + bool HasExtractBitsInsn;
> +
> /// Tells the code generator not to expand integer divides by constants
> into a
> /// sequence of muls, adds, and shifts. This is a hack until a real
> cost
> /// model is in place. If we ever optimize for size, this will be set
> to true
>
> Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
> +++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Mon Apr 21 14:34:27 2014
> @@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpIns
> return MadeChange;
> }
>
> +/// isExtractBitsCandidateUse - Check if the candidates could
> +/// be combined with shift instruction, which includes:
> +/// 1. Truncate instruction
> +/// 2. And instruction and the imm is a mask of the low bits:
> +/// imm & (imm+1) == 0
> +bool isExtractBitsCandidateUse(Instruction *User) {
> + if (!isa<TruncInst>(User)) {
> + if (User->getOpcode() != Instruction::And ||
> + !isa<ConstantInt>(User->getOperand(1)))
> + return false;
> +
> + unsigned Cimm =
> dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
> +
> + if (Cimm & (Cimm + 1))
> + return false;
> + }
> + return true;
> +}
> +
> +/// SinkShiftAndTruncate - sink both shift and truncate instruction
> +/// to the use of truncate's BB.
> +bool
> +SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User,
> ConstantInt *CI,
> + DenseMap<BasicBlock *, BinaryOperator *>
> &InsertedShifts,
> + const TargetLowering &TLI) {
> + BasicBlock *UserBB = User->getParent();
> + DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
> + TruncInst *TruncI = dyn_cast<TruncInst>(User);
> + bool MadeChange = false;
> +
> + for (Value::user_iterator TruncUI = TruncI->user_begin(),
> + TruncE = TruncI->user_end();
> + TruncUI != TruncE;) {
> +
> + Use &TruncTheUse = TruncUI.getUse();
> + Instruction *TruncUser = cast<Instruction>(*TruncUI);
> + // Preincrement use iterator so we don't invalidate it.
> +
> + ++TruncUI;
> +
> + int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
> + if (!ISDOpcode)
> + continue;
> +
> + // If the use is actually a legal node, there will not be an implicit
> + // truncate.
> + if (TLI.isOperationLegalOrCustom(ISDOpcode,
> + EVT::getEVT(TruncUser->getType())))
> + continue;
> +
> + // Don't bother for PHI nodes.
> + if (isa<PHINode>(TruncUser))
> + continue;
> +
> + BasicBlock *TruncUserBB = TruncUser->getParent();
> +
> + if (UserBB == TruncUserBB)
> + continue;
> +
> + BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
> + CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
> +
> + if (!InsertedShift && !InsertedTrunc) {
> + BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
> + // Sink the shift
> + if (ShiftI->getOpcode() == Instruction::AShr)
> + InsertedShift =
> + BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> + else
> + InsertedShift =
> + BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +
> + // Sink the trunc
> + BasicBlock::iterator TruncInsertPt =
> TruncUserBB->getFirstInsertionPt();
> + TruncInsertPt++;
> +
> + InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
> + TruncI->getType(), "",
> TruncInsertPt);
> +
> + MadeChange = true;
> +
> + TruncTheUse = InsertedTrunc;
> + }
> + }
> + return MadeChange;
> +}
> +
> +/// OptimizeExtractBits - sink the shift *right* instruction into user
> blocks if
> +/// the uses could potentially be combined with this shift instruction and
> +/// generate BitExtract instruction. It will only be applied if the
> architecture
> +/// supports BitExtract instruction. Here is an example:
> +/// BB1:
> +/// %x.extract.shift = lshr i64 %arg1, 32
> +/// BB2:
> +/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
> +/// ==>
> +///
> +/// BB2:
> +/// %x.extract.shift.1 = lshr i64 %arg1, 32
> +/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
> +///
> +/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
> +/// instruction.
> +/// Return true if any changes are made.
> +static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
> + const TargetLowering &TLI) {
> + BasicBlock *DefBB = ShiftI->getParent();
> +
> + /// Only insert instructions in each block once.
> + DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
> +
> + bool shiftIsLegal =
> TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
> +
> + bool MadeChange = false;
> + for (Value::user_iterator UI = ShiftI->user_begin(), E =
> ShiftI->user_end();
> + UI != E;) {
> + Use &TheUse = UI.getUse();
> + Instruction *User = cast<Instruction>(*UI);
> + // Preincrement use iterator so we don't invalidate it.
> + ++UI;
> +
> + // Don't bother for PHI nodes.
> + if (isa<PHINode>(User))
> + continue;
> +
> + if (!isExtractBitsCandidateUse(User))
> + continue;
> +
> + BasicBlock *UserBB = User->getParent();
> +
> + if (UserBB == DefBB) {
> + // If the shift and truncate instruction are in the same BB. The
> use of
> + // the truncate(TruncUse) may still introduce another truncate if
> not
> + // legal. In this case, we would like to sink both shift and
> truncate
> + // instruction to the BB of TruncUse.
> + // for example:
> + // BB1:
> + // i64 shift.result = lshr i64 opnd, imm
> + // trunc.result = trunc shift.result to i16
> + //
> + // BB2:
> + // ----> We will have an implicit truncate here if the
> architecture does
> + // not have i16 compare.
> + // cmp i16 trunc.result, opnd2
> + //
> + if (isa<TruncInst>(User) && shiftIsLegal
> + // If the type of the truncate is legal, no trucate will be
> + // introduced in other basic blocks.
> + && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
> + MadeChange =
> + SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
> +
> + continue;
> + }
> + // If we have already inserted a shift into this block, use it.
> + BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
> +
> + if (!InsertedShift) {
> + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
> +
> + if (ShiftI->getOpcode() == Instruction::AShr)
> + InsertedShift =
> + BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> + else
> + InsertedShift =
> + BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "",
> InsertPt);
> +
> + MadeChange = true;
> + }
> +
> + // Replace a use of the shift with a use of the new shift.
> + TheUse = InsertedShift;
> + }
> +
> + // If we removed all uses, nuke the shift.
> + if (ShiftI->use_empty())
> + ShiftI->eraseFromParent();
> +
> + return MadeChange;
> +}
> +
> namespace {
> class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
> protected:
> @@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instru
> return false;
> }
>
> + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
> +
> + if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
> + BinOp->getOpcode() == Instruction::LShr)) {
> + ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
> + if (TLI && CI && TLI->hasExtractBitsInsn())
> + return OptimizeExtractBits(BinOp, CI, *TLI);
> +
> + return false;
> + }
> +
> if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
> if (GEPI->hasAllZeroIndices()) {
> /// The GEP operand must be a pointer, so must its result -> BitCast
>
> Modified: llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp Mon Apr 21 14:34:27
> 2014
> @@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(S
> // Make sure to clamp the MSB so that we preserve the semantics of the
> // original operations.
> ClampMSB = true;
> + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
> + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
> + Srl_imm)) {
> + // If the shift result was truncated, we can still combine them.
> + Opd0 = Op0->getOperand(0).getOperand(0);
> +
> + // Use the type of SRL node.
> + VT = Opd0->getValueType(0);
> } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
> Opd0 = Op0->getOperand(0);
> } else if (BiggerPattern) {
> @@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(S
>
> // we're looking for a shift of a shift
> uint64_t Shl_imm = 0;
> + uint64_t Trunc_bits = 0;
> if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL,
> Shl_imm)) {
> Opd0 = N->getOperand(0).getOperand(0);
> + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
> + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
> + // We are looking for a shift of truncate. Truncate from i64 to i32
> could
> + // be considered as setting high 32 bits as zero. Our strategy here
> is to
> + // always generate 64bit UBFM. This consistency will help the CSE pass
> + // later find more redundancy.
> + Opd0 = N->getOperand(0).getOperand(0);
> + Trunc_bits = Opd0->getValueType(0).getSizeInBits() -
> VT.getSizeInBits();
> + VT = Opd0->getValueType(0);
> + assert(VT == MVT::i64 && "the promoted type should be i64");
> } else if (BiggerPattern) {
> // Let's pretend a 0 shift left has been performed.
> // FIXME: Currently we limit this to the bigger pattern case,
> @@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(S
> assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
> "bad amount in shift node!");
> // Note: The width operand is encoded as width-1.
> - unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
> + unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
> int sLSB = Srl_imm - Shl_imm;
> if (sLSB < 0)
> return false;
> @@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfiel
> return NULL;
>
> EVT VT = N->getValueType(0);
> - SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
> - CurDAG->getTargetConstant(MSB, VT) };
> +
> + // If the bit extract operation is 64bit but the original type is
> 32bit, we
> + // need to add one EXTRACT_SUBREG.
> + if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32)
> {
> + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
> + CurDAG->getTargetConstant(MSB, MVT::i64)};
> +
> + SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
> + SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
> + MachineSDNode *Node =
> + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
> MVT::i32,
> + SDValue(BFM, 0), SubReg);
> + return Node;
> + }
> +
> + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
> + CurDAG->getTargetConstant(MSB, VT)};
> return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
> }
>
>
> Modified: llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/ARM64/ARM64ISelLowering.cpp Mon Apr 21 14:34:27
> 2014
> @@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering
> setDivIsWellDefined(true);
>
> RequireStrictAlign = StrictAlign;
> +
> + setHasExtractBitsInsn(true);
> }
>
> void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
>
> Modified: llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll?rev=206774&r1=206773&r2=206774&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM64/bitfield-extract.ll Mon Apr 21 14:34:27
> 2014
> @@ -1,3 +1,4 @@
> +; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s |
> FileCheck --check-prefix=OPT %s
> ; RUN: llc < %s -march=arm64 | FileCheck %s
> %struct.X = type { i8, i8, [2 x i8] }
> %struct.Y = type { i32, i8 }
> @@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind s
> %result = and i64 %conv82, 255
> ret i64 %result
> }
> +
> +; Using the access to the global array to keep the instruction and
> control flow.
> + at first_ones = external global [65536 x i8]
> +
> +; Function Attrs: nounwind readonly ssp
> +define i32 @fct19(i64 %arg1) nounwind readonly ssp {
> +; CHECK-LABEL: fct19:
> +entry:
> + %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
> + %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
> + %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
> + %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
> + %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
> + br i1 %tobool, label %if.end, label %if.then
> +
> +if.then: ; preds = %entry
> + %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %x.sroa.5.0.extract.shift
> + %0 = load i8* %arrayidx3, align 1
> + %conv = zext i8 %0 to i32
> + br label %return
> +
> +; OPT-LABEL: if.end
> +if.end: ; preds = %entry
> +; OPT: lshr
> +; CHECK: ubfm [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
> + %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
> + %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
> +; CHECK: cbz
> + br i1 %tobool6, label %if.end13, label %if.then7
> +
> +; OPT-LABEL: if.then7
> +if.then7: ; preds = %if.end
> +; OPT: lshr
> +; "and" should be combined to "ubfm" while "ubfm" should be removed by
> cse.
> +; So neither of them should be in the assemble code.
> +; CHECK-NOT: and
> +; CHECK-NOT: ubfm
> + %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
> + %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %idxprom10
> + %1 = load i8* %arrayidx11, align 1
> + %conv12 = zext i8 %1 to i32
> + %add = add nsw i32 %conv12, 16
> + br label %return
> +
> +; OPT-LABEL: if.end13
> +if.end13: ; preds = %if.end
> +; OPT: lshr
> +; OPT: trunc
> +; CHECK: ubfm [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
> + %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
> +; CHECK: cbz
> + br i1 %tobool16, label %return, label %if.then17
> +
> +; OPT-LABEL: if.then17
> +if.then17: ; preds = %if.end13
> +; OPT: lshr
> +; "and" should be combined to "ubfm" while "ubfm" should be removed by
> cse.
> +; So neither of them should be in the assemble code.
> +; CHECK-NOT: and
> +; CHECK-NOT: ubfm
> + %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
> + %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0,
> i64 %idxprom20
> + %2 = load i8* %arrayidx21, align 1
> + %conv22 = zext i8 %2 to i32
> + %add23 = add nsw i32 %conv22, 32
> + br label %return
> +
> +return: ; preds = %if.end13,
> %if.then17, %if.then7, %if.then
> +; CHECK: ret
> + %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23,
> %if.then17 ], [ 64, %if.end13 ]
> + ret i32 %retval.0
> +}
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140421/d9d4936a/attachment.html>
More information about the llvm-commits
mailing list