[llvm] r336985 - [x86] Teach the EFLAGS copy lowering to handle much more complex control
Chandler Carruth via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 13 02:39:10 PDT 2018
Author: chandlerc
Date: Fri Jul 13 02:39:10 2018
New Revision: 336985
URL: http://llvm.org/viewvc/llvm-project?rev=336985&view=rev
Log:
[x86] Teach the EFLAGS copy lowering to handle much more complex control
flow patterns including forks, merges, and even cycles.
This tries to cover a reasonably comprehensive set of patterns that
still don't require PHIs or PHI placement. The coverage was inspired by
the amazing variety of patterns produced when copying EFLAGS and restoring
it to implement Speculative Load Hardening. Without this patch, we
simply cannot make such complex and invasive changes to x86 instruction
sequences due to EFLAGS.
I've added "just" one test, but this test covers many different
complexities and corner cases of this approach. It is actually more
comprehensive, as far as I can tell, than anything that I have
encountered in the wild on SLH.
Because the test is so complex, I've tried to give somewhat thorough
comments and an ASCII-art diagram of the control flows to make it a bit
easier to read and maintain long-term.
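To make the approach easier to follow before diving into the diff, here is a
small standalone sketch of the hoisting idea at block granularity: starting
from the block holding the copy's def of EFLAGS, repeatedly move the candidate
test point up to the nearest common dominator of the current block's
predecessors, as long as the flags remain live-in and no block along any path
from the candidate back down clobbers EFLAGS. The Block type and the helpers
below (nearestCommonDominator, hasClobberOnSomePath, hoistTestBlock) are
illustrative stand-ins only, not the MachineBasicBlock / MachineDominatorTree
code in the patch, and they conservatively treat a whole block as clobbering
rather than scanning individual instructions.

#include <set>
#include <string>
#include <utility>
#include <vector>

// Toy CFG node, illustrative only: the real pass works on MachineBasicBlock
// and queries MachineDominatorTree rather than storing IDom/Depth directly.
struct Block {
  std::string Name;
  bool FlagsLiveIn = false;    // is EFLAGS live into this block?
  bool ClobbersFlags = false;  // does any instruction here def EFLAGS?
  std::vector<Block *> Preds;  // CFG predecessors
  Block *IDom = nullptr;       // immediate dominator (the entry is its own)
  int Depth = 0;               // depth in the dominator tree
};

// Nearest common dominator of A and B, found by walking idom chains upward:
// always advance whichever block is deeper in the dominator tree.
static Block *nearestCommonDominator(Block *A, Block *B) {
  while (A != B) {
    if (A->Depth < B->Depth)
      std::swap(A, B);
    A = A->IDom;
  }
  return A;
}

// Walk predecessors backwards from End until Begin is reached, reporting
// whether any block strictly between the two clobbers the flags.
static bool hasClobberOnSomePath(Block *Begin, Block *End) {
  std::set<Block *> Visited{Begin};
  std::vector<Block *> Worklist{End};
  while (!Worklist.empty()) {
    Block *MBB = Worklist.back();
    Worklist.pop_back();
    for (Block *Pred : MBB->Preds) {
      if (!Visited.insert(Pred).second)
        continue;
      if (Pred->ClobbersFlags)
        return true;
      Worklist.push_back(Pred);
    }
  }
  return false;
}

// Hoist the "test" block for an EFLAGS copy whose def lives in CopyMBB as far
// up the dominator tree as the flags stay live-in and unclobbered.
static Block *hoistTestBlock(Block *CopyMBB) {
  Block *TestMBB = CopyMBB;
  while (TestMBB->FlagsLiveIn && !TestMBB->Preds.empty() &&
         !TestMBB->ClobbersFlags) {
    // Candidate hoist point: nearest common dominator of all predecessors.
    Block *Hoist = TestMBB->Preds.front();
    for (Block *Pred : TestMBB->Preds)
      Hoist = nearestCommonDominator(Hoist, Pred);
    // A clobber in the candidate itself, or anywhere on a path from it back
    // down to the current test block, prevents the hoist.
    if (Hoist == TestMBB || Hoist->ClobbersFlags ||
        hasClobberOnSomePath(Hoist, TestMBB))
      break;
    TestMBB = Hoist;
  }
  return TestMBB;
}

Requiring *no* clobbers anywhere along the paths back down is what keeps loops
safe without any PHI insertion.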
Differential Revision: https://reviews.llvm.org/D49220
Modified:
llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir
Modified: llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp?rev=336985&r1=336984&r2=336985&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp Fri Jul 13 02:39:10 2018
@@ -27,6 +27,7 @@
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -102,7 +103,7 @@ private:
MachineDominatorTree *MDT;
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI);
+ MachineBasicBlock::iterator CopyDefI);
unsigned promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
@@ -356,9 +357,14 @@ bool X86FlagsCopyLoweringPass::runOnMach
// Nothing to do for a degenerate empty function...
return false;
+ // Collect the copies in RPO so that when there are chains where a copy is in
+ // turn copied again we visit the first one first. This ensures we can find
+ // viable locations for testing the original EFLAGS that dominate all the
+ // uses across complex CFGs.
SmallVector<MachineInstr *, 4> Copies;
- for (MachineBasicBlock &MBB : MF)
- for (MachineInstr &MI : MBB)
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ for (MachineInstr &MI : *MBB)
if (MI.getOpcode() == TargetOpcode::COPY &&
MI.getOperand(0).getReg() == X86::EFLAGS)
Copies.push_back(&MI);
@@ -407,12 +413,99 @@ bool X86FlagsCopyLoweringPass::runOnMach
if (DOp.isDead())
continue;
- MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+ MachineBasicBlock *TestMBB = CopyDefI.getParent();
auto TestPos = CopyDefI.getIterator();
DebugLoc TestLoc = CopyDefI.getDebugLoc();
LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+ // Walk up across live-in EFLAGS to find where they were actually def'ed.
+ //
+ // This copy's def may just be part of a region of blocks covered by
+ // a single def of EFLAGS and we want to find the top of that region where
+ // possible.
+ //
+ // This is essentially a search for a *candidate* reaching definition
+ // location. We don't need to ever find the actual reaching definition here,
+ // but we want to walk up the dominator tree to find the highest point which
+ // would be viable for such a definition.
+ auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // Scan backwards as we expect these to be relatively short and often find
+ // a clobber near the end.
+ return llvm::any_of(
+ llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+ // Flag any instruction (other than the copy we are
+ // currently rewriting) that defs EFLAGS.
+ return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ };
+ auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+ MachineBasicBlock *EndMBB) {
+ assert(MDT->dominates(BeginMBB, EndMBB) &&
+ "Only support paths down the dominator tree!");
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist;
+ // We terminate at the beginning. No need to scan it.
+ Visited.insert(BeginMBB);
+ Worklist.push_back(EndMBB);
+ do {
+ auto *MBB = Worklist.pop_back_val();
+ for (auto *PredMBB : MBB->predecessors()) {
+ if (!Visited.insert(PredMBB).second)
+ continue;
+ if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+ return true;
+ // Enqueue this block to walk its predecessors.
+ Worklist.push_back(PredMBB);
+ }
+ } while (!Worklist.empty());
+ // No clobber found along any path from begin to end.
+ return false;
+ };
+ while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+ !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+ // Find the nearest common dominator of the predecessors, as
+ // that will be the best candidate to hoist into.
+ MachineBasicBlock *HoistMBB =
+ std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+ *TestMBB->pred_begin(),
+ [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+ return MDT->findNearestCommonDominator(LHS, RHS);
+ });
+
+ // Now we need to scan all predecessors that may be reached along paths to
+ // the hoist block. A clobber anywhere in any of these blocks prevents the hoist.
+ // Note that this even handles loops because we require *no* clobbers.
+ if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+ break;
+
+ // We also need the terminators to not sneakily clobber flags.
+ if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+ HoistMBB->instr_end()))
+ break;
+
+ // We found a viable location, hoist our test position to it.
+ TestMBB = HoistMBB;
+ TestPos = TestMBB->getFirstTerminator()->getIterator();
+ // Clear the debug location as it would just be confusing after hoisting.
+ TestLoc = DebugLoc();
+ }
+ LLVM_DEBUG({
+ auto DefIt = llvm::find_if(
+ llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+ [&](MachineInstr &MI) {
+ return MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ if (DefIt.base() != TestMBB->instr_begin()) {
+ dbgs() << " Using EFLAGS defined by: ";
+ DefIt->dump();
+ } else {
+ dbgs() << " Using live-in flags for BB:\n";
+ TestMBB->dump();
+ }
+ });
+
// While rewriting uses, we buffer jumps and rewrite them in a second pass
// because doing so will perturb the CFG that we are walking to find the
// uses in the first place.
@@ -423,7 +516,7 @@ bool X86FlagsCopyLoweringPass::runOnMach
// very few of them and we expect to not revisit the same copy definition
// many times. If either of those change sufficiently we could build a map
// of these up front instead.
- CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+ CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
// Collect the basic blocks we need to scan. Typically this will just be
// a single basic block but we may have to scan multiple blocks if the
@@ -431,7 +524,6 @@ bool X86FlagsCopyLoweringPass::runOnMach
SmallVector<MachineBasicBlock *, 2> Blocks;
SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
Blocks.push_back(&MBB);
- VisitedBlocks.insert(&MBB);
do {
MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
@@ -439,36 +531,32 @@ bool X86FlagsCopyLoweringPass::runOnMach
// Track when if/when we find a kill of the flags in this block.
bool FlagsKilled = false;
- // We currently don't do any PHI insertion and so we require that the
- // test basic block dominates all of the use basic blocks.
- //
- // We could in theory do PHI insertion here if it becomes useful by just
- // taking undef values in along every edge that we don't trace this
- // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
- // but still seems like a great deal of complexity.
- //
- // Because it is theoretically possible that some earlier MI pass or
- // other lowering transformation could induce this to happen, we do
- // a hard check even in non-debug builds here.
- if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
- LLVM_DEBUG({
- dbgs() << "ERROR: Encountered use that is not dominated by our test "
- "basic block! Rewriting this would require inserting PHI "
- "nodes to track the flag state across the CFG.\n\nTest "
- "block:\n";
- TestMBB.dump();
- dbgs() << "Use block:\n";
- UseMBB.dump();
- });
- report_fatal_error("Cannot lower EFLAGS copy when original copy def "
- "does not dominate all uses.");
- }
-
- for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
- : UseMBB.instr_begin(),
+ // In most cases, we walk from the beginning to the end of the block. But
+ // when the block is the same block as the copy is from, we will visit it
+ // twice. The first time we start from the copy and go to the end. The
+ // second time we start from the beginning and go to the copy. This lets
+ // us handle copies inside of cycles.
+ // FIXME: This loop is *super* confusing. This is at least in part
+ // a symptom of all of this routine needing to be refactored into
+ // documentable components. Once done, there may be a better way to write
+ // this loop.
+ for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+ ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
MIE = UseMBB.instr_end();
MII != MIE;) {
MachineInstr &MI = *MII++;
+ // If we are in the original copy block and encounter either the copy
+ // def or the copy itself, break so that we don't re-process any part of
+ // the block or process the instructions in the range that was copied
+ // over.
+ if (&MI == CopyI || &MI == &CopyDefI) {
+ assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+ "Should only encounter these on the second pass over the "
+ "original block.");
+ break;
+ }
+
MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
if (!FlagUse) {
if (MI.findRegisterDefOperand(X86::EFLAGS)) {
@@ -512,10 +600,10 @@ bool X86FlagsCopyLoweringPass::runOnMach
// Otherwise we can just rewrite in-place.
if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
X86::COND_INVALID) {
- rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
rewriteCopy(MI, *FlagUse, CopyDefI);
} else {
@@ -538,13 +626,13 @@ bool X86FlagsCopyLoweringPass::runOnMach
case X86::SETB_C64r:
// Use custom lowering for arithmetic that is merely extending the
// carry flag. We model this as the SETB_C* pseudo instructions.
- rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
default:
// Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
}
@@ -564,8 +652,38 @@ bool X86FlagsCopyLoweringPass::runOnMach
// and queue those up for processing.
for (MachineBasicBlock *SuccMBB : UseMBB.successors())
if (SuccMBB->isLiveIn(X86::EFLAGS) &&
- VisitedBlocks.insert(SuccMBB).second)
+ VisitedBlocks.insert(SuccMBB).second) {
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks. Further, we
+ // can't have a cycle from the test block back to itself as that would
+ // create a cycle requiring a PHI to break it.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by
+ // just taking undef values in along every edge that we don't trace
+ // this EFLAGS copy along. This isn't as bad as fully general PHI
+ // insertion, but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+ LLVM_DEBUG({
+ dbgs()
+ << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB->dump();
+ dbgs() << "Use block:\n";
+ SuccMBB->dump();
+ });
+ report_fatal_error(
+ "Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
Blocks.push_back(SuccMBB);
+ }
} while (!Blocks.empty());
// Now rewrite the jumps that use the flags. These we handle specially
@@ -580,7 +698,7 @@ bool X86FlagsCopyLoweringPass::runOnMach
else
LastJmpMBB = JmpI->getParent();
- rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
// FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -604,14 +722,13 @@ bool X86FlagsCopyLoweringPass::runOnMach
/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
CondRegArray CondRegs = {};
// Scan backwards across the range of instructions with live EFLAGS.
- for (MachineInstr &MI : llvm::reverse(
- llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+ for (MachineInstr &MI :
+ llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
TRI->isVirtualRegister(MI.getOperand(0).getReg()))
Modified: llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir?rev=336985&r1=336984&r2=336985&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir (original)
+++ llvm/trunk/test/CodeGen/X86/flags-copy-lowering.mir Fri Jul 13 02:39:10 2018
@@ -84,6 +84,12 @@
call void @foo()
ret i64 0
}
+
+ define i64 @test_mid_cycle_copies(i64 %a, i64 %b) {
+ entry:
+ call void @foo()
+ ret i64 0
+ }
...
---
name: test_branch
@@ -738,3 +744,195 @@ body: |
RET 0, $rax
...
+---
+# This test case is designed to exercise a particularly challenging situation:
+# when the flags are copied and restored *inside* of a complex and cyclic CFG
+# whose blocks all have live-in flags. To correctly handle this case we have to walk
+# up the dominator tree and locate a viable reaching definition location,
+# checking for clobbers along any path. The CFG for this function looks like the
+# following diagram, control flowing out the bottom of blocks and in the top:
+#
+# bb.0
+# | __________________
+# |/ \
+# bb.1 |
+# |\_________ |
+# | __ \ ____ |
+# |/ \ |/ \ |
+# bb.2 | bb.4 | |
+# |\__/ / \ | |
+# | / \ | |
+# bb.3 bb.5 bb.6 | |
+# | \ / | |
+# | \ / | |
+# | bb.7 | |
+# | ________/ \____/ |
+# |/ |
+# bb.8 |
+# |\__________________/
+# |
+# bb.9
+#
+# We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6.
+# Because of the cycles this requires hoisting the `SETcc` instructions to
+# capture the flags for the bb.6 copy to bb.1 and using them for the copy in
+# `bb.2` as well despite the clobber in `bb.3`. The clobber in `bb.3` also
+# prevents hoisting the `SETcc`s up to `bb.0`.
+#
+# Throughout the test we use branch instructions that are totally bogus (as the
+# flags are obviously not changing!) but this is just to allow us to send
+# a small but complex CFG structure through the backend and force it to choose
+# plausible lowering decisions based on the core CFG presented, regardless of
+# the futility of the actual branches.
+name: test_mid_cycle_copies
+# CHECK-LABEL: name: test_mid_cycle_copies
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $rdi, $rsi
+
+ %0:gr64 = COPY $rdi
+ %1:gr64 = COPY $rsi
+ CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK: bb.0:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ JMP_1 %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.4
+ liveins: $eflags
+
+ ; Outer loop header, target for one set of hoisting.
+ JE_1 %bb.2, implicit $eflags
+ JMP_1 %bb.4
+ ; CHECK: bb.1:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+ ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+ ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.2:
+ successors: %bb.2, %bb.3
+ liveins: $eflags
+
+ ; Inner loop with a local copy. We should eliminate this but can't hoist.
+ %2:gr64 = COPY $eflags
+ $eflags = COPY %2
+ JE_1 %bb.2, implicit $eflags
+ JMP_1 %bb.3
+ ; CHECK: bb.2:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.3:
+ successors: %bb.8
+ liveins: $eflags
+
+ ; Use and then clobber $eflags. Then hop to the outer loop latch.
+ %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags
+ ; CHECK: bb.3:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags
+ ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+ JMP_1 %bb.8
+
+ bb.4:
+ successors: %bb.5, %bb.6
+ liveins: $eflags
+
+ ; Another inner loop, this one with a diamond.
+ JE_1 %bb.5, implicit $eflags
+ JMP_1 %bb.6
+ ; CHECK: bb.4:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.5:
+ successors: %bb.7
+ liveins: $eflags
+
+ ; Just use $eflags on this side of the diamond.
+ %4:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK: bb.5:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+ JMP_1 %bb.7
+
+ bb.6:
+ successors: %bb.7
+ liveins: $eflags
+
+ ; Use, copy, and then use $eflags again.
+ %5:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK: bb.6:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+
+ %6:gr64 = COPY $eflags
+ $eflags = COPY %6:gr64
+
+ %7:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+ JMP_1 %bb.7
+
+ bb.7:
+ successors: %bb.4, %bb.8
+ liveins: $eflags
+
+ ; Inner loop latch.
+ JE_1 %bb.4, implicit $eflags
+ JMP_1 %bb.8
+ ; CHECK: bb.7:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.8:
+ successors: %bb.1, %bb.9
+
+ ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that
+ ; would immediately require PHIs.
+ CMP64rr %0, %1, implicit-def $eflags
+ JE_1 %bb.1, implicit $eflags
+ JMP_1 %bb.9
+ ; CHECK: bb.8:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.9:
+ liveins: $eflags
+
+ ; And we're done.
+ %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+ $rax = COPY %8
+ RET 0, $rax
+ ; CHECK: bb.9:
+ ; CHECK-NOT: $eflags
+ ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+
+...