[llvm] r224169 - [PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts
Hal Finkel
hfinkel at anl.gov
Fri Dec 12 15:59:36 PST 2014
Author: hfinkel
Date: Fri Dec 12 17:59:36 2014
New Revision: 224169
URL: http://llvm.org/viewvc/llvm-project?rev=224169&view=rev
Log:
[PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts
On PPC64, we end up with lots of i32 -> i64 zero extensions, not only from all
of the usual places, but also from the ABI, which specifies that values passed
are zero extended. Almost all 32-bit PPC instructions in PPC64 mode are defined
to do *something* to the higher-order bits, and for some instructions, that
action clears those bits (thus providing a zero-extended result). This is
especially common after rotate-and-mask instructions. Adding an additional
instruction to zero-extend the results of these instructions is unnecessary.
This PPCISelDAGToDAG peephole optimization examines these zero-extensions, and
looks back through their operands to see if all instructions will implicitly
zero extend their results. If so, we convert these instructions to their 64-bit
variants (which is an internal change only; the actual encoding of these
instructions is the same as the original 32-bit ones) and remove the
unnecessary zero-extension (changing where the INSERT_SUBREG instructions are
to make everything internally consistent).
Added:
llvm/trunk/test/CodeGen/PowerPC/rm-zext.ll
Modified:
llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
llvm/trunk/lib/Target/PowerPC/PPCInstr64Bit.td
llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp
Modified: llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp?rev=224169&r1=224168&r2=224169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp Fri Dec 12 17:59:36 2014
@@ -205,6 +205,7 @@ private:
SDNode *SelectSETCC(SDNode *N);
void PeepholePPC64();
+ void PeepholePPC64ZExt();
void PeepholeCROps();
bool AllUsersSelectZero(SDNode *N);
@@ -1628,6 +1629,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG
PeepholePPC64();
PeepholeCROps();
+ PeepholePPC64ZExt();
}
// Check if all users of this node will become isel where the second operand
@@ -2101,6 +2103,299 @@ void PPCDAGToDAGISel::PeepholeCROps() {
} while (IsModified);
}
+// Gather the set of 32-bit operations that are known to have their
+// higher-order 32 bits zero, where ToPromote contains all such operations.
+static bool PeepholePPC64ZExtGather(SDValue Op32,
+ SmallPtrSetImpl<SDNode *> &ToPromote) {
+ if (!Op32.isMachineOpcode())
+ return false;
+
+ // First, check for the "frontier" instructions (those that will clear the
+ // higher-order 32 bits).
+
+ // For RLWINM and RLWNM, we need to make sure that the mask does not wrap
+ // around. If it does not, then these instructions will clear the
+ // higher-order bits.
+ if ((Op32.getMachineOpcode() == PPC::RLWINM ||
+ Op32.getMachineOpcode() == PPC::RLWNM) &&
+ Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
+ ToPromote.insert(Op32.getNode());
+ return true;
+ }
+
+ // SLW and SRW always clear the higher-order bits.
+ if (Op32.getMachineOpcode() == PPC::SLW ||
+ Op32.getMachineOpcode() == PPC::SRW) {
+ ToPromote.insert(Op32.getNode());
+ return true;
+ }
+
+ // For LI and LIS, we need the immediate to be positive (so that it is not
+ // sign extended).
+ if (Op32.getMachineOpcode() == PPC::LI ||
+ Op32.getMachineOpcode() == PPC::LIS) {
+ if (!isUInt<15>(Op32.getConstantOperandVal(0)))
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+ return true;
+ }
+
+ // Next, check for those instructions we can look through.
+
+ // Assuming the mask does not wrap around, then the higher-order bits are
+ // taken directly from the first operand.
+ if (Op32.getMachineOpcode() == PPC::RLWIMI &&
+ Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
+ SmallPtrSet<SDNode *, 16> ToPromote1;
+ if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+ ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+ return true;
+ }
+
+ // For OR, the higher-order bits are zero if that is true for both operands.
+ // For SELECT_I4, the same is true (but the relevant operand numbers are
+ // shifted by 1).
+ if (Op32.getMachineOpcode() == PPC::OR ||
+ Op32.getMachineOpcode() == PPC::SELECT_I4) {
+ unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
+ SmallPtrSet<SDNode *, 16> ToPromote1;
+ if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
+ return false;
+ if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+ ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+ return true;
+ }
+
+ // For ORI and ORIS, we need the higher-order bits of the first operand to be
+ // zero, and also for the constant to be positive (so that it is not sign
+ // extended).
+ if (Op32.getMachineOpcode() == PPC::ORI ||
+ Op32.getMachineOpcode() == PPC::ORIS) {
+ SmallPtrSet<SDNode *, 16> ToPromote1;
+ if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+ return false;
+ if (!isUInt<15>(Op32.getConstantOperandVal(1)))
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+ ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+ return true;
+ }
+
+ // The higher-order bits of AND are zero if that is true for at least one of
+ // the operands.
+ if (Op32.getMachineOpcode() == PPC::AND) {
+ SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
+ bool Op0OK =
+ PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+ bool Op1OK =
+ PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
+ if (!Op0OK && !Op1OK)
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+
+ if (Op0OK)
+ ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+ if (Op1OK)
+ ToPromote.insert(ToPromote2.begin(), ToPromote2.end());
+
+ return true;
+ }
+
+ // For ANDI and ANDIS, the higher-order bits are zero if either that is true
+ // of the first operand, or if the second operand is positive (so that it is
+ // not sign extended).
+ if (Op32.getMachineOpcode() == PPC::ANDIo ||
+ Op32.getMachineOpcode() == PPC::ANDISo) {
+ SmallPtrSet<SDNode *, 16> ToPromote1;
+ bool Op0OK =
+ PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+ bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
+ if (!Op0OK && !Op1OK)
+ return false;
+
+ ToPromote.insert(Op32.getNode());
+
+ if (Op0OK)
+ ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+ return true;
+ }
+
+ return false;
+}
+
+void PPCDAGToDAGISel::PeepholePPC64ZExt() {
+ if (!PPCSubTarget->isPPC64())
+ return;
+
+ // When we zero-extend from i32 to i64, we use a pattern like this:
+ // def : Pat<(i64 (zext i32:$in)),
+ // (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+ // 0, 32)>;
+ // There are several 32-bit shift/rotate instructions, however, that will
+ // clear the higher-order bits of their output, rendering the RLDICL
+ // unnecessary. When that happens, we remove it here, and redefine the
+ // relevant 32-bit operation to be a 64-bit operation.
+
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = --Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != PPC::RLDICL)
+ continue;
+
+ if (N->getConstantOperandVal(1) != 0 ||
+ N->getConstantOperandVal(2) != 32)
+ continue;
+
+ SDValue ISR = N->getOperand(0);
+ if (!ISR.isMachineOpcode() ||
+ ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
+ continue;
+
+ if (!ISR.hasOneUse())
+ continue;
+
+ if (ISR.getConstantOperandVal(2) != PPC::sub_32)
+ continue;
+
+ SDValue IDef = ISR.getOperand(0);
+ if (!IDef.isMachineOpcode() ||
+ IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
+ continue;
+
+ // We now know that we're looking at a canonical i32 -> i64 zext. See if we
+ // can get rid of it.
+
+ SDValue Op32 = ISR->getOperand(1);
+ if (!Op32.isMachineOpcode())
+ continue;
+
+ // There are some 32-bit instructions that always clear the high-order 32
+ // bits; there are also some instructions (like AND) that we can look
+ // through.
+ SmallPtrSet<SDNode *, 16> ToPromote;
+ if (!PeepholePPC64ZExtGather(Op32, ToPromote))
+ continue;
+
+ // If the ToPromote set contains nodes that have uses outside of the set
+ // (except for the original INSERT_SUBREG), then abort the transformation.
+ bool OutsideUse = false;
+ for (SDNode *PN : ToPromote) {
+ for (SDNode *UN : PN->uses()) {
+ if (!ToPromote.count(UN) && UN != ISR.getNode()) {
+ OutsideUse = true;
+ break;
+ }
+ }
+
+ if (OutsideUse)
+ break;
+ }
+ if (OutsideUse)
+ continue;
+
+ MadeChange = true;
+
+ // We now know that this zero extension can be removed by promoting the
+ // nodes in ToPromote to 64-bit operations, where for operations in the
+ // frontier of the set, we need to insert INSERT_SUBREGs for their
+ // operands.
+ for (SDNode *PN : ToPromote) {
+ unsigned NewOpcode;
+ switch (PN->getMachineOpcode()) {
+ default:
+ llvm_unreachable("Don't know the 64-bit variant of this instruction");
+ case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break;
+ case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break;
+ case PPC::SLW: NewOpcode = PPC::SLW8; break;
+ case PPC::SRW: NewOpcode = PPC::SRW8; break;
+ case PPC::LI: NewOpcode = PPC::LI8; break;
+ case PPC::LIS: NewOpcode = PPC::LIS8; break;
+ case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break;
+ case PPC::OR: NewOpcode = PPC::OR8; break;
+ case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
+ case PPC::ORI: NewOpcode = PPC::ORI8; break;
+ case PPC::ORIS: NewOpcode = PPC::ORIS8; break;
+ case PPC::AND: NewOpcode = PPC::AND8; break;
+ case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break;
+ case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break;
+ }
+
+ // Note: During the replacement process, the nodes will be in an
+ // inconsistent state (some instructions will have operands with values
+ // of the wrong type). Once done, however, everything should be right
+ // again.
+
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : PN->ops()) {
+ if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
+ !isa<ConstantSDNode>(V)) {
+ SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
+ SDNode *ReplOp =
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
+ ISR.getNode()->getVTList(), ReplOpOps);
+ Ops.push_back(SDValue(ReplOp, 0));
+ } else {
+ Ops.push_back(V);
+ }
+ }
+
+ // Because all to-be-promoted nodes only have users that are other
+ // promoted nodes (or the original INSERT_SUBREG), we can safely replace
+ // the i32 result value type with i64.
+
+ SmallVector<EVT, 2> NewVTs;
+ SDVTList VTs = PN->getVTList();
+ for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
+ if (VTs.VTs[i] == MVT::i32)
+ NewVTs.push_back(MVT::i64);
+ else
+ NewVTs.push_back(VTs.VTs[i]);
+
+ DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
+ DEBUG(PN->dump(CurDAG));
+
+ CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
+
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(PN->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+ }
+
+ // Now we replace the original zero extend and its associated INSERT_SUBREG
+ // with the value feeding the INSERT_SUBREG (which has now been promoted to
+ // return an i64).
+
+ DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
+ DEBUG(N->dump(CurDAG));
+ DEBUG(dbgs() << "\nNew: ");
+ DEBUG(Op32.getNode()->dump(CurDAG));
+ DEBUG(dbgs() << "\n");
+
+ ReplaceUses(N, Op32.getNode());
+ }
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
+}
+
void PPCDAGToDAGISel::PeepholePPC64() {
// These optimizations are currently supported only for 64-bit SVR4.
if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
Modified: llvm/trunk/lib/Target/PowerPC/PPCInstr64Bit.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstr64Bit.td?rev=224169&r1=224168&r2=224169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstr64Bit.td (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstr64Bit.td Fri Dec 12 17:59:36 2014
@@ -547,6 +547,11 @@ defm EXTSB8 : XForm_11r<31, 954, (outs g
defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
"extsh", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+
+defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
} // Interpretation64Bit
// For fast-isel:
@@ -645,7 +650,11 @@ defm RLWINM8 : MForm_2r<21, (outs g8rc:$
"rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
[]>;
-let isCommutable = 1 in {
+defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA),
+ (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
+ "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+ []>;
+
// RLWIMI can be commuted if the rotate amount is zero.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
@@ -653,7 +662,6 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$
u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
-}
let isSelect = 1 in
def ISEL8 : AForm_4<31, 15,
Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp?rev=224169&r1=224168&r2=224169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.cpp Fri Dec 12 17:59:36 2014
@@ -230,10 +230,12 @@ PPCInstrInfo::commuteInstruction(Machine
// Normal instructions can be commuted the obvious way.
if (MI->getOpcode() != PPC::RLWIMI &&
- MI->getOpcode() != PPC::RLWIMIo &&
- MI->getOpcode() != PPC::RLWIMI8 &&
- MI->getOpcode() != PPC::RLWIMI8o)
+ MI->getOpcode() != PPC::RLWIMIo)
return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
+ // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
+ // changing the relative order of the mask operands might change what happens
+ // to the high-bits of the mask (and, thus, the result).
// Cannot commute if it has a non-zero rotate count.
if (MI->getOperand(3).getImm() != 0)
Added: llvm/trunk/test/CodeGen/PowerPC/rm-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/rm-zext.ll?rev=224169&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/rm-zext.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/rm-zext.ll Fri Dec 12 17:59:36 2014
@@ -0,0 +1,32 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @foo(i32 signext %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, %a
+ %shr2 = lshr i32 %mul, 5
+ ret i32 %shr2
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+define zeroext i32 @test6(i32 zeroext %x) #0 {
+entry:
+ %and = lshr i32 %x, 16
+ %shr = and i32 %and, 255
+ %and1 = shl i32 %x, 16
+ %shl = and i32 %and1, 16711680
+ %or = or i32 %shr, %shl
+ ret i32 %or
+
+; CHECK-LABEL: @test6
+; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
More information about the llvm-commits
mailing list