[llvm] r259009 - [WebAssembly] Enhanced register stackification

Dan Gohman via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 27 17:22:44 PST 2016


Author: djg
Date: Wed Jan 27 19:22:44 2016
New Revision: 259009

URL: http://llvm.org/viewvc/llvm-project?rev=259009&view=rev
Log:
[WebAssembly] Enhanced register stackification

This patch revamps the RegStackifier pass with a new tree traversal mechanism,
enabling three major new features (a simplified sketch of the traversal
follows the list):

 - Stackification of values with multiple uses, using the result value of set_local.
 - More aggressive stackification of instructions with side effects.
 - Reordering operands in commutative instructions to enable more stackification.
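
To make the traversal idea concrete, here is a rough, self-contained C++
sketch; it is not code from this commit, and all names in it are invented.
The real walk operates on MachineOperands via the TreeWalkerState class added
to WebAssemblyRegStackify.cpp below, and decides per operand whether to move,
rematerialize, or tee the defining instruction.

    // Simplified sketch of the worklist-based DFS over an instruction's
    // operand tree. Operands are visited last-first, matching the LIFO
    // order in which values are popped off the WebAssembly expression stack.
    #include <functional>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands; // explicit use operands, in source order
    };

    void walkOperandTree(Node *Root,
                         const std::function<bool(Node *)> &TryStackify) {
      // Each worklist entry holds the not-yet-visited operands of one node.
      std::vector<std::vector<Node *>> Worklist;
      if (!Root->Operands.empty())
        Worklist.push_back(Root->Operands);
      while (!Worklist.empty()) {
        Node *Op = Worklist.back().back(); // next operand, last one first
        Worklist.back().pop_back();
        if (Worklist.back().empty())
          Worklist.pop_back();
        // If the operand's def was stackified, its own operands join the
        // walk, deepening the tree; otherwise move on to the next operand.
        if (TryStackify(Op) && !Op->Operands.empty())
          Worklist.push_back(Op->Operands);
      }
    }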

Modified:
    llvm/trunk/include/llvm/CodeGen/MachineInstr.h
    llvm/trunk/lib/Target/WebAssembly/README.txt
    llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
    llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
    llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
    llvm/trunk/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
    llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll
    llvm/trunk/test/CodeGen/WebAssembly/reg-stackify.ll
    llvm/trunk/test/CodeGen/WebAssembly/varargs.ll

Modified: llvm/trunk/include/llvm/CodeGen/MachineInstr.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineInstr.h?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineInstr.h (original)
+++ llvm/trunk/include/llvm/CodeGen/MachineInstr.h Wed Jan 27 19:22:44 2016
@@ -343,6 +343,14 @@ public:
     return make_range(operands_begin() + getDesc().getNumDefs(),
                       operands_end());
   }
+  iterator_range<mop_iterator> explicit_uses() {
+    return make_range(operands_begin() + getDesc().getNumDefs(),
+                      operands_begin() + getNumExplicitOperands());
+  }
+  iterator_range<const_mop_iterator> explicit_uses() const {
+    return make_range(operands_begin() + getDesc().getNumDefs(),
+                      operands_begin() + getNumExplicitOperands());
+  }
 
   /// Returns the number of the operand iterator \p I points to.
   unsigned getOperandNo(const_mop_iterator I) const {
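
Illustrative only, not part of this commit: a minimal caller-side sketch of
the new explicit_uses() accessor, mirroring how the stackifier below walks an
instruction's explicit use operands. It assumes an in-tree LLVM context from
this era (header locations may differ in later trees), and countVirtRegUses
is an invented name.

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    // Count how many explicit use operands of MI are virtual registers.
    static unsigned countVirtRegUses(const llvm::MachineInstr *MI) {
      unsigned Count = 0;
      for (const llvm::MachineOperand &MO : MI->explicit_uses())
        if (MO.isReg() &&
            llvm::TargetRegisterInfo::isVirtualRegister(MO.getReg()))
          ++Count;
      return Count;
    }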

Modified: llvm/trunk/lib/Target/WebAssembly/README.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/WebAssembly/README.txt?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/lib/Target/WebAssembly/README.txt (original)
+++ llvm/trunk/lib/Target/WebAssembly/README.txt Wed Jan 27 19:22:44 2016
@@ -32,12 +32,6 @@ Interesting work that remains to be done
 
 //===---------------------------------------------------------------------===//
 
-set_local instructions have a return value. We should (a) model this,
-and (b) write optimizations which take advantage of it. Keep in mind that
-many set_local instructions are implicit!
-
-//===---------------------------------------------------------------------===//
-
 Br, br_if, and tableswitch instructions can support having a value on the
 expression stack across the jump (sometimes). We should (a) model this, and
 (b) extend the stackifier to utilize it.
@@ -87,3 +81,17 @@ Find a clean way to fix the problem whic
 being run after the WebAssembly PEI pass.
 
 //===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+    i32.const   $4=, 0
+    i32.const   $3=, 0
+
+It could be done with a smaller encoding like this:
+
+    i32.const   $push5=, 0
+    tee_local   $push6=, $4=, $pop5
+    copy_local  $3=, $pop6
+
+//===---------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp Wed Jan 27 19:22:44 2016
@@ -15,6 +15,7 @@
 
 #include "WebAssemblyInstrInfo.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblySubtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -75,6 +76,21 @@ void WebAssemblyInstrInfo::copyPhysReg(M
       .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
 }
 
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+                                             unsigned OpIdx1,
+                                             unsigned OpIdx2) const {
+  // If the operands are stackified, we can't reorder them.
+  WebAssemblyFunctionInfo &MFI =
+      *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+  if (MFI.isVRegStackified(MI->getOperand(OpIdx1).getReg()) ||
+      MFI.isVRegStackified(MI->getOperand(OpIdx2).getReg()))
+    return nullptr;
+
+  // Otherwise use the default implementation.
+  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
 // Branch analysis.
 bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                          MachineBasicBlock *&TBB,
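
For context (and not part of this commit): passes do not call
commuteInstructionImpl directly. They go through
TargetInstrInfo::commuteInstruction, which dispatches to this override, so
returning nullptr makes the commute attempt fail without modifying the
instruction. A rough caller-side sketch follows, with TII and MI assumed to
come from the surrounding pass, much like the CommutingState helper added
further down in this patch; tryCommute is an invented name.

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetInstrInfo.h"

    // Try to commute MI's operands; returns true if a commute happened.
    static bool tryCommute(const llvm::TargetInstrInfo *TII,
                           llvm::MachineInstr *MI) {
      unsigned Idx1 = llvm::TargetInstrInfo::CommuteAnyOperandIndex;
      unsigned Idx2 = llvm::TargetInstrInfo::CommuteAnyOperandIndex;
      if (!TII->findCommutedOpIndices(MI, Idx1, Idx2))
        return false;
      // Dispatches to the target's commuteInstructionImpl; the WebAssembly
      // override above declines when either operand is already stackified.
      return TII->commuteInstruction(MI, /*NewMI=*/false, Idx1, Idx2) != nullptr;
    }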

Modified: llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.h?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.h (original)
+++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.h Wed Jan 27 19:22:44 2016
@@ -40,6 +40,9 @@ public:
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                    DebugLoc DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
+  MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI,
+                                       unsigned OpIdx1,
+                                       unsigned OpIdx2) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,

Modified: llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.td?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.td (original)
+++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyInstrInfo.td Wed Jan 27 19:22:44 2016
@@ -107,15 +107,8 @@ defm : ARGUMENT<F64>;
 let Defs = [ARGUMENTS] in {
 
 // get_local and set_local are not generated by instruction selection; they
-// are implied by virtual register uses and defs in most contexts. However,
-// they are explicitly emitted for special purposes.
+// are implied by virtual register uses and defs.
 multiclass LOCAL<WebAssemblyRegClass vt> {
-  def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [],
-                        "get_local\t$res, $regno">;
-  // TODO: set_local returns its operand value
-  def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [],
-                        "set_local\t$regno, $src">;
-
   // COPY_LOCAL is not an actual instruction in wasm, but since we allow
   // get_local and set_local to be implicit, we can have a COPY_LOCAL which
   // is actually a no-op because all the work is done in the implied
@@ -123,6 +116,13 @@ multiclass LOCAL<WebAssemblyRegClass vt>
   let isAsCheapAsAMove = 1 in
   def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [],
                          "copy_local\t$res, $src">;
+
+  // TEE_LOCAL is similar to COPY_LOCAL, but writes two copies of its result.
+  // Typically this would be used to stackify one result and write the other
+  // result to a local.
+  let isAsCheapAsAMove = 1 in
+  def TEE_LOCAL_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+                        "tee_local\t$res, $also, $src">;
 }
 defm : LOCAL<I32>;
 defm : LOCAL<I64>;
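
Not part of this commit, but for readers mapping the TableGen operands to the
emission code below: the two outs ($res, $also) become the instruction's first
two register operands and $src the third, so building a TEE_LOCAL looks
roughly like the sketch below. It assumes the usual in-tree includes within
the WebAssembly target directory; emitTee and the register names are invented.

    #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/Target/TargetInstrInfo.h"

    // Sketch: emit "tee_local TeeReg, LocalReg, SrcReg" before InsertPt.
    // TeeReg corresponds to $res (the stackified copy), LocalReg to $also
    // (the copy written back to a local), and SrcReg to $src.
    static llvm::MachineInstr *emitTee(llvm::MachineBasicBlock &MBB,
                                       llvm::MachineInstr *InsertPt,
                                       const llvm::TargetInstrInfo *TII,
                                       unsigned TeeReg, unsigned LocalReg,
                                       unsigned SrcReg) {
      return BuildMI(MBB, InsertPt, InsertPt->getDebugLoc(),
                     TII->get(WebAssembly::TEE_LOCAL_I32), TeeReg)
          .addReg(LocalReg, llvm::RegState::Define)
          .addReg(SrcReg);
    }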

Modified: llvm/trunk/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp (original)
+++ llvm/trunk/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp Wed Jan 27 19:22:44 2016
@@ -27,6 +27,8 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Debug.h"
@@ -44,12 +46,13 @@ class WebAssemblyRegStackify final : pub
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MachineDominatorTree>();
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<MachineBlockFrequencyInfo>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(MachineDominatorsID);
     AU.addPreservedID(LiveVariablesID);
+    AU.addPreserved<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -89,8 +92,8 @@ static void ImposeStackOrdering(MachineI
 // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
 // more precise.
 static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
-                         AliasAnalysis &AA, LiveIntervals &LIS,
-                         MachineRegisterInfo &MRI) {
+                         AliasAnalysis &AA, const LiveIntervals &LIS,
+                         const MachineRegisterInfo &MRI) {
   assert(Def->getParent() == Insert->getParent());
   bool SawStore = false, SawSideEffects = false;
   MachineBasicBlock::const_iterator D(Def), I(Insert);
@@ -133,6 +136,250 @@ static bool IsSafeToMove(const MachineIn
          !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
 }
 
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+                                     const MachineBasicBlock &MBB,
+                                     const MachineRegisterInfo &MRI,
+                                     const MachineDominatorTree &MDT) {
+  for (const MachineOperand &Use : MRI.use_operands(Reg)) {
+    if (&Use == &OneUse)
+      continue;
+    const MachineInstr *UseInst = Use.getParent();
+    const MachineInstr *OneUseInst = OneUse.getParent();
+    if (UseInst->getOpcode() == TargetOpcode::PHI) {
+      // Test that the PHI use, which happens on the CFG edge rather than
+      // within the PHI's own block, is dominated by the one selected use.
+      const MachineBasicBlock *Pred =
+          UseInst->getOperand(&Use - &UseInst->getOperand(0) + 1).getMBB();
+      if (!MDT.dominates(&MBB, Pred))
+        return false;
+    } else if (UseInst == OneUseInst) {
+      // Another use in the same instruction. We need to ensure that the one
+      // selected use happens "before" it.
+      if (&OneUse > &Use)
+        return false;
+    } else {
+      // Test that the use is dominated by the one selected use.
+      if (!MDT.dominates(OneUseInst, UseInst))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Get the appropriate tee_local opcode for the given register class.
+static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::TEE_LOCAL_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::TEE_LOCAL_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::TEE_LOCAL_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::TEE_LOCAL_F64;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineInstr *Def,
+                                      MachineBasicBlock &MBB,
+                                      MachineInstr *Insert, LiveIntervals &LIS,
+                                      WebAssemblyFunctionInfo &MFI) {
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(Def);
+  MFI.stackifyVReg(Reg);
+  ImposeStackOrdering(Def);
+  return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+static MachineInstr *
+RematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr *Def,
+                      MachineBasicBlock &MBB, MachineInstr *Insert,
+                      LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+                      MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII,
+                      const WebAssemblyRegisterInfo *TRI) {
+  unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+  TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+  Op.setReg(NewReg);
+  MachineInstr *Clone = &*std::prev(MachineBasicBlock::instr_iterator(Insert));
+  LIS.InsertMachineInstrInMaps(Clone);
+  LIS.createAndComputeVirtRegInterval(NewReg);
+  MFI.stackifyVReg(NewReg);
+  ImposeStackOrdering(Clone);
+
+  // If that was the last use of the original, delete the original.
+  // Otherwise shrink the LiveInterval.
+  if (MRI.use_empty(Reg)) {
+    SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+    LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+    LIS.removeVRegDefAt(LIS.getInterval(Reg), Idx);
+    LIS.removeInterval(Reg);
+    LIS.RemoveMachineInstrFromMaps(Def);
+    Def->eraseFromParent();
+  } else {
+    LIS.shrinkToUses(&LIS.getInterval(Reg));
+  }
+  return Clone;
+}
+
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee_local to satisfy the rest of the uses. As an illustration,
+/// rewrite this:
+///
+///    Reg = INST ...        // Def
+///    INST ..., Reg, ...    // Insert
+///    INST ..., Reg, ...
+///    INST ..., Reg, ...
+///
+/// to this:
+///
+///    Reg = INST ...        // Def (to become the new Insert)
+///    TeeReg, NewReg = TEE_LOCAL_... Reg
+///    INST ..., TeeReg, ... // Insert
+///    INST ..., NewReg, ...
+///    INST ..., NewReg, ...
+///
+/// with Reg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+static MachineInstr *MoveAndTeeForMultiUse(
+    unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+    MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+    MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(Def);
+  const auto *RegClass = MRI.getRegClass(Reg);
+  unsigned NewReg = MRI.createVirtualRegister(RegClass);
+  unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+  MRI.replaceRegWith(Reg, NewReg);
+  MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+                              TII->get(GetTeeLocalOpcode(RegClass)), TeeReg)
+                          .addReg(NewReg, RegState::Define)
+                          .addReg(Reg);
+  Op.setReg(TeeReg);
+  Def->getOperand(0).setReg(Reg);
+  LIS.InsertMachineInstrInMaps(Tee);
+  LIS.shrinkToUses(&LIS.getInterval(Reg));
+  LIS.createAndComputeVirtRegInterval(NewReg);
+  LIS.createAndComputeVirtRegInterval(TeeReg);
+  MFI.stackifyVReg(Reg);
+  MFI.stackifyVReg(TeeReg);
+  ImposeStackOrdering(Def);
+  ImposeStackOrdering(Tee);
+  return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+class TreeWalkerState {
+  typedef MachineInstr::mop_iterator mop_iterator;
+  typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+  typedef iterator_range<mop_reverse_iterator> RangeTy;
+  SmallVector<RangeTy, 4> Worklist;
+
+public:
+  explicit TreeWalkerState(MachineInstr *Insert) {
+    const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  bool Done() const { return Worklist.empty(); }
+
+  MachineOperand &Pop() {
+    RangeTy &Range = Worklist.back();
+    MachineOperand &Op = *Range.begin();
+    Range = drop_begin(Range, 1);
+    if (Range.begin() == Range.end())
+      Worklist.pop_back();
+    assert((Worklist.empty() ||
+            Worklist.back().begin() != Worklist.back().end()) &&
+           "Empty ranges shouldn't remain in the worklist");
+    return Op;
+  }
+
+  /// Push Instr's operands onto the stack to be visited.
+  void PushOperands(MachineInstr *Instr) {
+    const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  /// Some of Instr's operands are on the top of the stack; remove them and
+  /// re-insert them starting from the beginning (because we've commuted them).
+  void ResetTopOperands(MachineInstr *Instr) {
+    assert(HasRemainingOperands(Instr) &&
+           "Resetting operands should only be done when the instruction has "
+           "an operand still on the stack");
+    Worklist.back() = reverse(Instr->explicit_uses());
+  }
+
+  /// Test whether Instr has operands remaining to be visited at the top of
+  /// the stack.
+  bool HasRemainingOperands(const MachineInstr *Instr) const {
+    if (Worklist.empty())
+      return false;
+    const RangeTy &Range = Worklist.back();
+    return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+  }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+  /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tentative
+  /// state where we've commuted the operands of the current instruction and are
+  /// revisiting it, and the declined state where we've reverted the operands
+  /// back to their original order and will no longer commute it further.
+  bool TentativelyCommuting;
+  bool Declined;
+
+  /// During the tentative state, these hold the operand indices of the commuted
+  /// operands.
+  unsigned Operand0, Operand1;
+
+public:
+  CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+  /// Stackification for an operand was not successful due to ordering
+  /// constraints. If possible, and if we haven't already tried it and declined
+  /// it, commute Insert's operands and prepare to revisit it.
+  void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+                    const WebAssemblyInstrInfo *TII) {
+    if (TentativelyCommuting) {
+      assert(!Declined &&
+             "Don't decline commuting until you've finished trying it");
+      // Commuting didn't help. Revert it.
+      TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+      TentativelyCommuting = false;
+      Declined = true;
+    } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+      Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+      Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+      if (TII->findCommutedOpIndices(Insert, Operand0, Operand1)) {
+        // Tentatively commute the operands and try again.
+        TII->commuteInstruction(Insert, /*NewMI=*/false, Operand0, Operand1);
+        TreeWalker.ResetTopOperands(Insert);
+        TentativelyCommuting = true;
+        Declined = false;
+      }
+    }
+  }
+
+  /// Stackification for some operand was successful. Reset to the default
+  /// state.
+  void Reset() {
+    TentativelyCommuting = false;
+    Declined = false;
+  }
+};
+} // end anonymous namespace
+
 bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "********** Register Stackifying **********\n"
                   "********** Function: "
@@ -144,6 +391,7 @@ bool WebAssemblyRegStackify::runOnMachin
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   LiveIntervals &LIS = getAnalysis<LiveIntervals>();
 
   // Walk the instructions from the bottom up. Currently we don't look past
@@ -165,19 +413,35 @@ bool WebAssemblyRegStackify::runOnMachin
 
       // Iterate through the inputs in reverse order, since we'll be pulling
       // operands off the stack in LIFO order.
-      bool AnyStackified = false;
-      for (MachineOperand &Op : reverse(Insert->uses())) {
+      CommutingState Commuting;
+      TreeWalkerState TreeWalker(Insert);
+      while (!TreeWalker.Done()) {
+        MachineOperand &Op = TreeWalker.Pop();
+
         // We're only interested in explicit virtual register operands.
-        if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+        if (!Op.isReg())
           continue;
 
         unsigned Reg = Op.getReg();
+        assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+        assert(!Op.isImplicit() &&
+               "explicit_uses() should only iterate over explicit operands");
+        if (TargetRegisterInfo::isPhysicalRegister(Reg))
+          continue;
 
-        // Only consider registers with a single definition.
-        // TODO: Eventually we may relax this, to stackify phi transfers.
+        // Identify the definition for this register at this point. Most
+        // registers are in SSA form here so we try a quick MRI query first.
         MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
-        if (!Def)
-          continue;
+        if (!Def) {
+          // MRI doesn't know what the Def is. Try asking LIS.
+          const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
+              LIS.getInstructionIndex(Insert));
+          if (!ValNo)
+            continue;
+          Def = LIS.getInstructionFromIndex(ValNo->def);
+          if (!Def)
+            continue;
+        }
 
         // Don't nest an INLINE_ASM def into anything, because we don't have
         // constraints for $pop outputs.
@@ -196,59 +460,52 @@ bool WebAssemblyRegStackify::runOnMachin
             Def->getOpcode() == WebAssembly::ARGUMENT_F64)
           continue;
 
-        if (MRI.hasOneUse(Reg) && Def->getParent() == &MBB &&
-            IsSafeToMove(Def, Insert, AA, LIS, MRI)) {
-          // A single-use def in the same block with no intervening memory or
-          // register dependencies; move the def down and nest it with the
-          // current instruction.
-          // TODO: Stackify multiple-use values, taking advantage of set_local
-          // returning its result.
-          Changed = true;
-          AnyStackified = true;
-          MBB.splice(Insert, &MBB, Def);
-          LIS.handleMove(Def);
-          MFI.stackifyVReg(Reg);
-          ImposeStackOrdering(Def);
-          Insert = Def;
+        // Decide which strategy to take. Prefer to move a single-use value
+        // over cloning it, and prefer cloning over introducing a tee_local.
+        // For moving, we require the def to be in the same block as the use;
+        // this makes things simpler (LiveIntervals' handleMove function only
+        // supports intra-block moves) and it's MachineSink's job to catch all
+        // the sinking opportunities anyway.
+        bool SameBlock = Def->getParent() == &MBB;
+        bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI);
+        if (CanMove && MRI.hasOneUse(Reg)) {
+          Insert = MoveForSingleUse(Reg, Def, MBB, Insert, LIS, MFI);
         } else if (Def->isAsCheapAsAMove() &&
                    TII->isTriviallyReMaterializable(Def, &AA)) {
-          // A trivially cloneable instruction; clone it and nest the new copy
-          // with the current instruction.
-          Changed = true;
-          AnyStackified = true;
-          unsigned OldReg = Def->getOperand(0).getReg();
-          unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
-          TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
-          Op.setReg(NewReg);
-          MachineInstr *Clone =
-              &*std::prev(MachineBasicBlock::instr_iterator(Insert));
-          LIS.InsertMachineInstrInMaps(Clone);
-          LIS.createAndComputeVirtRegInterval(NewReg);
-          MFI.stackifyVReg(NewReg);
-          ImposeStackOrdering(Clone);
-          Insert = Clone;
-
-          // If that was the last use of the original, delete the original.
-          // Otherwise shrink the LiveInterval.
-          if (MRI.use_empty(OldReg)) {
-            SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
-            LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
-            LIS.removeVRegDefAt(LIS.getInterval(OldReg), Idx);
-            LIS.removeInterval(OldReg);
-            LIS.RemoveMachineInstrFromMaps(Def);
-            Def->eraseFromParent();
-          } else {
-            LIS.shrinkToUses(&LIS.getInterval(OldReg));
-          }
+          Insert = RematerializeCheapDef(Reg, Op, Def, MBB, Insert, LIS, MFI,
+                                         MRI, TII, TRI);
+        } else if (CanMove &&
+                   OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT)) {
+          Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+                                         MRI, TII);
+        } else {
+          // We failed to stackify the operand. If the problem was ordering
+          // constraints, Commuting may be able to help.
+          if (!CanMove && SameBlock)
+            Commuting.MaybeCommute(Insert, TreeWalker, TII);
+          // Proceed to the next operand.
+          continue;
         }
+
+        // We stackified an operand. Add the defining instruction's operands to
+        // the worklist stack now to continue to build an ever deeper tree.
+        Commuting.Reset();
+        TreeWalker.PushOperands(Insert);
       }
-      if (AnyStackified)
+
+      // If we stackified any operands, skip over the tree to start looking for
+      // the next instruction we can build a tree on.
+      if (Insert != &*MII) {
         ImposeStackOrdering(&*MII);
+        MII = std::prev(
+            make_reverse_iterator(MachineBasicBlock::iterator(Insert)));
+        Changed = true;
+      }
     }
   }
 
-  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
-  // so that it never looks like a use-before-def.
+  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere so
+  // that it never looks like a use-before-def.
   if (Changed) {
     MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
     for (MachineBasicBlock &MBB : MF)
@@ -263,23 +520,25 @@ bool WebAssemblyRegStackify::runOnMachin
       for (MachineOperand &MO : reverse(MI.explicit_operands())) {
         if (!MO.isReg())
           continue;
-        unsigned VReg = MO.getReg();
+        unsigned Reg = MO.getReg();
 
         // Don't stackify physregs like SP or FP.
-        if (!TargetRegisterInfo::isVirtualRegister(VReg))
+        if (!TargetRegisterInfo::isVirtualRegister(Reg))
           continue;
 
-        if (MFI.isVRegStackified(VReg)) {
+        if (MFI.isVRegStackified(Reg)) {
           if (MO.isDef())
-            Stack.push_back(VReg);
+            Stack.push_back(Reg);
           else
-            assert(Stack.pop_back_val() == VReg);
+            assert(Stack.pop_back_val() == Reg &&
+                   "Register stack pop should be paired with a push");
         }
       }
     }
     // TODO: Generalize this code to support keeping values on the stack across
     // basic block boundaries.
-    assert(Stack.empty());
+    assert(Stack.empty() &&
+           "Register stack pushes and pops should be balanced");
   }
 #endif
 

Modified: llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll (original)
+++ llvm/trunk/test/CodeGen/WebAssembly/cfg-stackify.ll Wed Jan 27 19:22:44 2016
@@ -979,7 +979,7 @@ bb6:
 ; OPT-LABEL: test11:
 ; OPT:       block{{$}}
 ; OPT-NEXT:  block{{$}}
-; OPT:       br_if        $0, 0{{$}}
+; OPT:       br_if        $pop{{[0-9]+}}, 0{{$}}
 ; OPT-NEXT:  block{{$}}
 ; OPT-NOT:   block
 ; OPT:       br_if        $0, 0{{$}}
@@ -1121,31 +1121,33 @@ bb7:
 ; CHECK-LABEL: test13:
 ; CHECK-NEXT:  .local i32{{$}}
 ; CHECK:       block{{$}}
-; CHECK:       br_if $pop4, 0{{$}}
+; CHECK:       br_if $pop5, 0{{$}}
 ; CHECK-NEXT:  return{{$}}
 ; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:  end_block{{$}}
 ; CHECK:       block{{$}}
-; CHECK-NEXT:  br_if $0, 0{{$}}
+; CHECK-NEXT:  i32.const $push3=, 0{{$}}
+; CHECK-NEXT:  br_if $pop3, 0{{$}}
 ; CHECK:       .LBB22_4:
 ; CHECK-NEXT:  end_block{{$}}
 ; CHECK:       block{{$}}
-; CHECK:       br_if $pop6, 0{{$}}
+; CHECK:       br_if $pop7, 0{{$}}
 ; CHECK-NEXT:  end_block{{$}}
 ; CHECK-NEXT:  unreachable{{$}}
 ; OPT-LABEL: test13:
 ; OPT-NEXT:  .local i32{{$}}
 ; OPT:       block{{$}}
-; OPT:       br_if $pop4, 0{{$}}
+; OPT:       br_if $pop5, 0{{$}}
 ; OPT-NEXT:  return{{$}}
 ; OPT-NEXT:  .LBB22_2:
 ; OPT-NEXT:  end_block{{$}}
 ; OPT:       block{{$}}
-; OPT-NEXT:  br_if $0, 0{{$}}
+; OPT-NEXT:  i32.const $push3=, 0{{$}}
+; OPT-NEXT:  br_if $pop3, 0{{$}}
 ; OPT:       .LBB22_4:
 ; OPT-NEXT:  end_block{{$}}
 ; OPT:       block{{$}}
-; OPT:       br_if $pop6, 0{{$}}
+; OPT:       br_if $pop7, 0{{$}}
 ; OPT-NEXT:  end_block{{$}}
 ; OPT-NEXT:  unreachable{{$}}
 define void @test13() noinline optnone {

Modified: llvm/trunk/test/CodeGen/WebAssembly/reg-stackify.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/WebAssembly/reg-stackify.ll?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/WebAssembly/reg-stackify.ll (original)
+++ llvm/trunk/test/CodeGen/WebAssembly/reg-stackify.ll Wed Jan 27 19:22:44 2016
@@ -90,17 +90,18 @@ false:
 }
 
 ; Test an interesting case where the load has multiple uses and cannot
-; be trivially stackified.
+; be trivially stackified. However, it can be stackified with a tee_local.
 
 ; CHECK-LABEL: multiple_uses:
 ; CHECK-NEXT: .param       i32, i32, i32{{$}}
 ; CHECK-NEXT: .local       i32{{$}}
-; CHECK-NEXT: i32.load    $3=, 0($2){{$}}
 ; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: i32.ge_u    $push0=, $3, $1{{$}}
-; CHECK-NEXT: br_if       $pop0, 0{{$}}
-; CHECK-NEXT: i32.lt_u    $push1=, $3, $0{{$}}
+; CHECK-NEXT: i32.load    $push0=, 0($2){{$}}
+; CHECK-NEXT: tee_local   $push3=, $3=, $pop0{{$}}
+; CHECK-NEXT: i32.ge_u    $push1=, $pop3, $1{{$}}
 ; CHECK-NEXT: br_if       $pop1, 0{{$}}
+; CHECK-NEXT: i32.lt_u    $push2=, $3, $0{{$}}
+; CHECK-NEXT: br_if       $pop2, 0{{$}}
 ; CHECK-NEXT: i32.store   $discard=, 0($2), $3{{$}}
 ; CHECK-NEXT: .LBB5_3:
 ; CHECK-NEXT: end_block{{$}}
@@ -145,4 +146,102 @@ entry:
   ret void
 }
 
+; Div instructions have side effects and can't be reordered, but this entire
+; function should still be able to be stackified because it's already in
+; tree order.
+
+; CHECK-LABEL: div_tree:
+; CHECK-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
+; CHECK-NEXT: .result     i32{{$}}
+; CHECK-NEXT: i32.div_s   $push0=, $0, $1
+; CHECK-NEXT: i32.div_s   $push1=, $2, $3
+; CHECK-NEXT: i32.div_s   $push2=, $pop0, $pop1
+; CHECK-NEXT: i32.div_s   $push3=, $4, $5
+; CHECK-NEXT: i32.div_s   $push4=, $6, $7
+; CHECK-NEXT: i32.div_s   $push5=, $pop3, $pop4
+; CHECK-NEXT: i32.div_s   $push6=, $pop2, $pop5
+; CHECK-NEXT: i32.div_s   $push7=, $8, $9
+; CHECK-NEXT: i32.div_s   $push8=, $10, $11
+; CHECK-NEXT: i32.div_s   $push9=, $pop7, $pop8
+; CHECK-NEXT: i32.div_s   $push10=, $12, $13
+; CHECK-NEXT: i32.div_s   $push11=, $14, $15
+; CHECK-NEXT: i32.div_s   $push12=, $pop10, $pop11
+; CHECK-NEXT: i32.div_s   $push13=, $pop9, $pop12
+; CHECK-NEXT: i32.div_s   $push14=, $pop6, $pop13
+; CHECK-NEXT: return      $pop14
+define i32 @div_tree(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) {
+entry:
+  %div = sdiv i32 %a, %b
+  %div1 = sdiv i32 %c, %d
+  %div2 = sdiv i32 %div, %div1
+  %div3 = sdiv i32 %e, %f
+  %div4 = sdiv i32 %g, %h
+  %div5 = sdiv i32 %div3, %div4
+  %div6 = sdiv i32 %div2, %div5
+  %div7 = sdiv i32 %i, %j
+  %div8 = sdiv i32 %k, %l
+  %div9 = sdiv i32 %div7, %div8
+  %div10 = sdiv i32 %m, %n
+  %div11 = sdiv i32 %o, %p
+  %div12 = sdiv i32 %div10, %div11
+  %div13 = sdiv i32 %div9, %div12
+  %div14 = sdiv i32 %div6, %div13
+  ret i32 %div14
+}
+
+; A simple multiple-use case.
+
+; CHECK-LABEL: simple_multiple_use:
+; CHECK-NEXT:  .param      i32, i32{{$}}
+; CHECK-NEXT:  i32.mul     $push0=, $1, $0{{$}}
+; CHECK-NEXT:  tee_local   $push1=, $0=, $pop0{{$}}
+; CHECK-NEXT:  call        use_a at FUNCTION, $pop1{{$}}
+; CHECK-NEXT:  call        use_b at FUNCTION, $0{{$}}
+; CHECK-NEXT:  return{{$}}
+declare void @use_a(i32)
+declare void @use_b(i32)
+define void @simple_multiple_use(i32 %x, i32 %y) {
+  %mul = mul i32 %y, %x
+  call void @use_a(i32 %mul)
+  call void @use_b(i32 %mul)
+  ret void
+}
+
+; Multiple uses of the same value in one instruction.
+
+; CHECK-LABEL: multiple_uses_in_same_insn:
+; CHECK-NEXT:  .param      i32, i32{{$}}
+; CHECK-NEXT:  i32.mul     $push0=, $1, $0{{$}}
+; CHECK-NEXT:  tee_local   $push1=, $0=, $pop0{{$}}
+; CHECK-NEXT:  call        use_2 at FUNCTION, $pop1, $0{{$}}
+; CHECK-NEXT:  return{{$}}
+declare void @use_2(i32, i32)
+define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
+  %mul = mul i32 %y, %x
+  call void @use_2(i32 %mul, i32 %mul)
+  ret void
+}
+
+; Commute operands to achieve better stackifying.
+
+; CHECK-LABEL: commute:
+; CHECK-NEXT:  .result     i32{{$}}
+; CHECK-NEXT:  i32.call    $push0=, red at FUNCTION{{$}}
+; CHECK-NEXT:  i32.call    $push1=, green at FUNCTION{{$}}
+; CHECK-NEXT:  i32.add     $push2=, $pop0, $pop1{{$}}
+; CHECK-NEXT:  i32.call    $push3=, blue at FUNCTION{{$}}
+; CHECK-NEXT:  i32.add     $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT:  return      $pop4{{$}}
+declare i32 @red()
+declare i32 @green()
+declare i32 @blue()
+define i32 @commute() {
+  %call = call i32 @red()
+  %call1 = call i32 @green()
+  %add = add i32 %call1, %call
+  %call2 = call i32 @blue()
+  %add3 = add i32 %add, %call2
+  ret i32 %add3
+}
+
 !0 = !{}

Modified: llvm/trunk/test/CodeGen/WebAssembly/varargs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/WebAssembly/varargs.ll?rev=259009&r1=259008&r2=259009&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/WebAssembly/varargs.ll (original)
+++ llvm/trunk/test/CodeGen/WebAssembly/varargs.ll Wed Jan 27 19:22:44 2016
@@ -49,12 +49,13 @@ entry:
 ; CHECK-NEXT: .param     i32{{$}}
 ; CHECK-NEXT: .result    i32{{$}}
 ; CHECK-NEXT: .local     i32{{$}}
-; CHECK-NEXT: i32.load   $1=, 0($0){{$}}
-; CHECK-NEXT: i32.const  $push0=, 4{{$}}
-; CHECK-NEXT: i32.add    $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.store  $discard=, 0($0), $pop1{{$}}
-; CHECK-NEXT: i32.load   $push2=, 0($1){{$}}
-; CHECK-NEXT: return     $pop2{{$}}
+; CHECK-NEXT: i32.load   $push0=, 0($0){{$}}
+; CHECK-NEXT: tee_local  $push4=, $1=, $pop0{{$}}
+; CHECK-NEXT: i32.const  $push1=, 4{{$}}
+; CHECK-NEXT: i32.add    $push2=, $pop4, $pop1{{$}}
+; CHECK-NEXT: i32.store  $discard=, 0($0), $pop2{{$}}
+; CHECK-NEXT: i32.load   $push3=, 0($1){{$}}
+; CHECK-NEXT: return     $pop3{{$}}
 define i8 @arg_i8(i8** %ap) {
 entry:
   %t = va_arg i8** %ap, i8
@@ -71,12 +72,13 @@ entry:
 ; CHECK-NEXT: i32.const  $push1=, 3{{$}}
 ; CHECK-NEXT: i32.add    $push2=, $pop0, $pop1{{$}}
 ; CHECK-NEXT: i32.const  $push3=, -4{{$}}
-; CHECK-NEXT: i32.and    $1=, $pop2, $pop3{{$}}
-; CHECK-NEXT: i32.const  $push4=, 4{{$}}
-; CHECK-NEXT: i32.add    $push5=, $1, $pop4{{$}}
-; CHECK-NEXT: i32.store  $discard=, 0($0), $pop5{{$}}
-; CHECK-NEXT: i32.load   $push6=, 0($1){{$}}
-; CHECK-NEXT: return     $pop6{{$}}
+; CHECK-NEXT: i32.and    $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: tee_local  $push8=, $1=, $pop4{{$}}
+; CHECK-NEXT: i32.const  $push5=, 4{{$}}
+; CHECK-NEXT: i32.add    $push6=, $pop8, $pop5{{$}}
+; CHECK-NEXT: i32.store  $discard=, 0($0), $pop6{{$}}
+; CHECK-NEXT: i32.load   $push7=, 0($1){{$}}
+; CHECK-NEXT: return     $pop7{{$}}
 define i32 @arg_i32(i8** %ap) {
 entry:
   %t = va_arg i8** %ap, i32



