[llvm] r330264 - [x86] Fix PR37100 by teaching the EFLAGS copy lowering to rewrite uses

Chandler Carruth via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 18 08:13:16 PDT 2018


Author: chandlerc
Date: Wed Apr 18 08:13:16 2018
New Revision: 330264

URL: http://llvm.org/viewvc/llvm-project?rev=330264&view=rev
Log:
[x86] Fix PR37100 by teaching the EFLAGS copy lowering to rewrite uses
across basic blocks in the limited cases where it is very straightforward
to do so.

This will also be useful for other places where we do some limited
EFLAGS propagation across CFG edges and need to handle copy rewrites
afterward. I think this is rapidly approaching the maximum we can and
should be doing here. Everything else begins to require either heroic
analysis to prove how to do PHI insertion manually, or somehow managing
arbitrary PHI-ing of EFLAGS with general PHI insertion. Neither of these
seems at all promising, so if those cases come up, we'll almost certainly
need to rewrite the parts of LLVM that produce those patterns.

We do now require dominator trees in order to reliably diagnose patterns
that would require PHI nodes. This is a bit unfortunate, but it seems
better than the completely mysterious crash we would get otherwise.
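
For context, the dominator tree is wired in through the standard
legacy-pass-manager idiom, visible in the hunks below: declare the
dependency in getAnalysisUsage and fetch the result in
runOnMachineFunction. A minimal, self-contained sketch of that idiom
(the pass name here is hypothetical, not the lowering pass itself):

  #include "llvm/CodeGen/MachineDominators.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"

  using namespace llvm;

  namespace {
  // Hypothetical example pass, shown only to illustrate the plumbing.
  struct FlagsLoweringSketch : MachineFunctionPass {
    static char ID;
    MachineDominatorTree *MDT = nullptr;

    FlagsLoweringSketch() : MachineFunctionPass(ID) {}

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      // Ask the pass manager to construct the dominator tree before this
      // pass runs; this is what adds the extra "MachineDominator Tree
      // Construction" line to the pipeline tests below.
      AU.addRequired<MachineDominatorTree>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      MDT = &getAnalysis<MachineDominatorTree>();
      // The lowering pass then rejects, with a fatal error, any flag use
      // in a block the test block does not dominate, e.g.:
      //   if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB))
      //     report_fatal_error(...);
      return false;
    }
  };
  } // end anonymous namespace

  char FlagsLoweringSketch::ID = 0;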

Differential Revision: https://reviews.llvm.org/D45673

Modified:
    llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
    llvm/trunk/test/CodeGen/X86/O0-pipeline.ll
    llvm/trunk/test/CodeGen/X86/O3-pipeline.ll
    llvm/trunk/test/CodeGen/X86/copy-eflags.ll

Modified: llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp?rev=330264&r1=330263&r2=330264&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86FlagsCopyLowering.cpp Wed Apr 18 08:13:16 2018
@@ -36,6 +36,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -98,6 +99,7 @@ private:
   const X86InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const TargetRegisterClass *PromoteRC;
+  MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                   MachineInstr &CopyDefI);
@@ -145,6 +147,7 @@ FunctionPass *llvm::createX86FlagsCopyLo
 char X86FlagsCopyLoweringPass::ID = 0;
 
 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -342,6 +345,7 @@ bool X86FlagsCopyLoweringPass::runOnMach
   MRI = &MF.getRegInfo();
   TII = Subtarget.getInstrInfo();
   TRI = Subtarget.getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   PromoteRC = &X86::GR8RegClass;
 
   if (MF.begin() == MF.end())
@@ -416,103 +420,142 @@ bool X86FlagsCopyLoweringPass::runOnMach
     // of these up front instead.
     CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
 
-    for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
-         MII != MIE;) {
-      MachineInstr &MI = *MII++;
-      MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
-      if (!FlagUse) {
-        if (MI.findRegisterDefOperand(X86::EFLAGS)) {
-          // If EFLAGS are defined, it's as-if they were killed. We can stop
-          // scanning here.
-          //
-          // NB!!! Many instructions only modify some flags. LLVM currently
-          // models this as clobbering all flags, but if that ever changes this
-          // will need to be carefully updated to handle that more complex
-          // logic.
-          FlagsKilled = true;
-          break;
-        }
-        continue;
-      }
-
-      DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
+    // Collect the basic blocks we need to scan. Typically this will just be
+    // a single basic block but we may have to scan multiple blocks if the
+    // EFLAGS copy lives into successors.
+    SmallVector<MachineBasicBlock *, 2> Blocks;
+    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+    Blocks.push_back(&MBB);
+    VisitedBlocks.insert(&MBB);
 
-      // Check the kill flag before we rewrite as that may change it.
-      if (FlagUse->isKill())
-        FlagsKilled = true;
+    do {
+      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
 
-      // Once we encounter a branch, the rest of the instructions must also be
-      // branches. We can't rewrite in place here, so we handle them below.
+      // We currently don't do any PHI insertion and so we require that the
+      // test basic block dominates all of the use basic blocks.
       //
-      // Note that we don't have to handle tail calls here, even conditional
-      // tail calls, as those are not introduced into the X86 MI until post-RA
-      // branch folding or block placement. As a consequence, we get to deal
-      // with the simpler formulation of conditional branches followed by tail
-      // calls.
-      if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
-        auto JmpIt = MI.getIterator();
-        do {
-          JmpIs.push_back(&*JmpIt);
-          ++JmpIt;
-        } while (JmpIt != MBB.instr_end() &&
-                 X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
-                     X86::COND_INVALID);
-        break;
+      // We could in theory do PHI insertion here if it becomes useful by just
+      // taking undef values in along every edge that we don't trace this
+      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
+      // but still seems like a great deal of complexity.
+      //
+      // Because it is theoretically possible that some earlier MI pass or
+      // other lowering transformation could induce this to happen, we do
+      // a hard check even in non-debug builds here.
+      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
+        DEBUG({
+          dbgs() << "ERROR: Encountered use that is not dominated by our test "
+                    "basic block! Rewriting this would require inserting PHI "
+                    "nodes to track the flag state across the CFG.\n\nTest "
+                    "block:\n";
+          TestMBB.dump();
+          dbgs() << "Use block:\n";
+          UseMBB.dump();
+        });
+        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
+                           "does not dominate all uses.");
       }
 
-      // Otherwise we can just rewrite in-place.
-      if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
-        rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
-      } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) {
-        rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
-      } else if (MI.getOpcode() == TargetOpcode::COPY) {
-        rewriteCopy(MI, *FlagUse, CopyDefI);
-      } else {
-        // We assume that arithmetic instructions that use flags also def them.
-        assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
-               "Expected a def of EFLAGS for this instruction!");
-
-        // NB!!! Several arithmetic instructions only *partially* update
-        // flags. Theoretically, we could generate MI code sequences that
-        // would rely on this fact and observe different flags independently.
-        // But currently LLVM models all of these instructions as clobbering
-        // all the flags in an undef way. We rely on that to simplify the
-        // logic.
-        FlagsKilled = true;
+      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
+                                      : UseMBB.instr_begin(),
+                MIE = UseMBB.instr_end();
+           MII != MIE;) {
+        MachineInstr &MI = *MII++;
+        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
+        if (!FlagUse) {
+          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
+            // If EFLAGS are defined, it's as-if they were killed. We can stop
+            // scanning here.
+            //
+            // NB!!! Many instructions only modify some flags. LLVM currently
+            // models this as clobbering all flags, but if that ever changes
+            // this will need to be carefully updated to handle that more
+            // complex logic.
+            FlagsKilled = true;
+            break;
+          }
+          continue;
+        }
 
-        rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
-        break;
+        DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
+
+        // Check the kill flag before we rewrite as that may change it.
+        if (FlagUse->isKill())
+          FlagsKilled = true;
+
+        // Once we encounter a branch, the rest of the instructions must also be
+        // branches. We can't rewrite in place here, so we handle them below.
+        //
+        // Note that we don't have to handle tail calls here, even conditional
+        // tail calls, as those are not introduced into the X86 MI until post-RA
+        // branch folding or block placement. As a consequence, we get to deal
+        // with the simpler formulation of conditional branches followed by tail
+        // calls.
+        if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+          auto JmpIt = MI.getIterator();
+          do {
+            JmpIs.push_back(&*JmpIt);
+            ++JmpIt;
+          } while (JmpIt != UseMBB.instr_end() &&
+                   X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+                       X86::COND_INVALID);
+          break;
+        }
+
+        // Otherwise we can just rewrite in-place.
+        if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+          rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+        } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
+                   X86::COND_INVALID) {
+          rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+        } else if (MI.getOpcode() == TargetOpcode::COPY) {
+          rewriteCopy(MI, *FlagUse, CopyDefI);
+        } else {
+          // We assume that arithmetic instructions that use flags also def
+          // them.
+          assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
+                 "Expected a def of EFLAGS for this instruction!");
+
+          // NB!!! Several arithmetic instructions only *partially* update
+          // flags. Theoretically, we could generate MI code sequences that
+          // would rely on this fact and observe different flags independently.
+          // But currently LLVM models all of these instructions as clobbering
+          // all the flags in an undef way. We rely on that to simplify the
+          // logic.
+          FlagsKilled = true;
+
+          rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          break;
+        }
+
+        // If this was the last use of the flags, we're done.
+        if (FlagsKilled)
+          break;
       }
 
-      // If this was the last use of the flags, we're done.
+      // If the flags were killed, we're done with this block.
       if (FlagsKilled)
         break;
-    }
 
-    // If we didn't find a kill (or equivalent) check that the flags don't
-    // live-out of the basic block. Currently we don't support lowering copies
-    // of flags that live out in this fashion.
-    if (!FlagsKilled &&
-        llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) {
-          return SuccMBB->isLiveIn(X86::EFLAGS);
-        })) {
-      DEBUG({
-        dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n"
-               << "----\n";
-        MBB.dump();
-        dbgs() << "----\n"
-               << "ERROR: Cannot lower this EFLAGS copy!\n";
-      });
-      report_fatal_error(
-          "Cannot lower EFLAGS copy that lives out of a basic block!");
-    }
+      // Otherwise we need to scan successors for ones where the flags are
+      // live-in and queue those up for processing.
+      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
+        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
+            VisitedBlocks.insert(SuccMBB).second)
+          Blocks.push_back(SuccMBB);
+    } while (!Blocks.empty());
 
     // Now rewrite the jumps that use the flags. These we handle specially
-    // because if there are multiple jumps we'll have to do surgery on the CFG.
+    // because if there are multiple jumps in a single basic block we'll have
+    // to do surgery on the CFG.
+    MachineBasicBlock *LastJmpMBB = nullptr;
     for (MachineInstr *JmpI : JmpIs) {
-      // Past the first jump we need to split the blocks apart.
-      if (JmpI != JmpIs.front())
+      // Past the first jump within a basic block we need to split the blocks
+      // apart.
+      if (JmpI->getParent() == LastJmpMBB)
         splitBlock(*JmpI->getParent(), *JmpI, *TII);
+      else
+        LastJmpMBB = JmpI->getParent();
 
       rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }
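
Stripped of the rewriting details, the cross-block scan added above is
a small worklist walk: a SmallVector as the stack and a SmallPtrSet to
avoid revisiting blocks. A minimal sketch of just that traversal, with
an illustrative helper name and a generic flags register parameter in
place of X86::EFLAGS (this is not code from the patch):

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"

  using namespace llvm;

  // Visit the start block and every transitive successor into which the
  // flags register is live. The real pass scans each visited block for
  // flag uses and stops queueing once the flags are killed.
  static void visitFlagUseBlocks(MachineBasicBlock &StartMBB,
                                 unsigned FlagsReg) {
    SmallVector<MachineBasicBlock *, 2> Blocks;
    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
    Blocks.push_back(&StartMBB);
    VisitedBlocks.insert(&StartMBB);

    do {
      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

      // ... rewrite the EFLAGS uses in UseMBB here ...

      // Only queue successors that list the flags as live-in. The visited
      // set makes the walk terminate even on cyclic CFGs, such as the
      // single-block loop in the PR37100 test below.
      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
        if (SuccMBB->isLiveIn(FlagsReg) &&
            VisitedBlocks.insert(SuccMBB).second)
          Blocks.push_back(SuccMBB);
    } while (!Blocks.empty());
  }

Because only successors with EFLAGS in their live-in lists get queued,
the common single-block case is unchanged. The LastJmpMBB bookkeeping
at the end exists because JmpIs can now span several blocks: a split is
only needed for a second conditional jump within the same block.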

Modified: llvm/trunk/test/CodeGen/X86/O0-pipeline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/O0-pipeline.ll?rev=330264&r1=330263&r2=330264&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/O0-pipeline.ll (original)
+++ llvm/trunk/test/CodeGen/X86/O0-pipeline.ll Wed Apr 18 08:13:16 2018
@@ -37,6 +37,7 @@
 ; CHECK-NEXT:       X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT:       Expand ISel Pseudo-instructions
 ; CHECK-NEXT:       Local Stack Slot Allocation
+; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       X86 EFLAGS copy lowering
 ; CHECK-NEXT:       X86 WinAlloca Expander
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation

Modified: llvm/trunk/test/CodeGen/X86/O3-pipeline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/O3-pipeline.ll?rev=330264&r1=330263&r2=330264&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/O3-pipeline.ll (original)
+++ llvm/trunk/test/CodeGen/X86/O3-pipeline.ll Wed Apr 18 08:13:16 2018
@@ -90,6 +90,7 @@
 ; CHECK-NEXT:       X86 LEA Optimize
 ; CHECK-NEXT:       X86 Optimize Call Frame
 ; CHECK-NEXT:       X86 Avoid Store Forwarding Block
+; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       X86 EFLAGS copy lowering
 ; CHECK-NEXT:       X86 WinAlloca Expander
 ; CHECK-NEXT:       Detect Dead Lanes

Modified: llvm/trunk/test/CodeGen/X86/copy-eflags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/copy-eflags.ll?rev=330264&r1=330263&r2=330264&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/copy-eflags.ll (original)
+++ llvm/trunk/test/CodeGen/X86/copy-eflags.ll Wed Apr 18 08:13:16 2018
@@ -196,3 +196,111 @@ else:
   tail call void @external_b()
   ret void
 }
+
+; Test a function that gets special select lowering into a CFG with copied EFLAGS
+; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
+; cross-block rewrites in at least some narrow cases.
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
+; X32-LABEL: PR37100:
+; X32:       # %bb.0: # %bb
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT:    jmp .LBB3_1
+; X32-NEXT:    .p2align 4, 0x90
+; X32-NEXT:  .LBB3_5: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    idivl %ebp
+; X32-NEXT:  .LBB3_1: # %bb1
+; X32-NEXT:    # =>This Inner Loop Header: Depth=1
+; X32-NEXT:    movsbl %cl, %eax
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    sarl $31, %edx
+; X32-NEXT:    cmpl %eax, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    sbbl %edx, %eax
+; X32-NEXT:    setl %al
+; X32-NEXT:    setl %dl
+; X32-NEXT:    movzbl %dl, %ebp
+; X32-NEXT:    negl %ebp
+; X32-NEXT:    testb $-1, %al
+; X32-NEXT:    jne .LBB3_3
+; X32-NEXT:  # %bb.2: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movb %ch, %cl
+; X32-NEXT:  .LBB3_3: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movb %cl, (%ebx)
+; X32-NEXT:    movl (%edi), %edx
+; X32-NEXT:    testb $-1, %al
+; X32-NEXT:    jne .LBB3_5
+; X32-NEXT:  # %bb.4: # %bb1
+; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    jmp .LBB3_5
+;
+; X64-LABEL: PR37100:
+; X64:       # %bb.0: # %bb
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    jmp .LBB3_1
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB3_5: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    idivl %esi
+; X64-NEXT:  .LBB3_1: # %bb1
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movsbq %dil, %rax
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    setl %sil
+; X64-NEXT:    negl %esi
+; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    jl .LBB3_3
+; X64-NEXT:  # %bb.2: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movl %ecx, %edi
+; X64-NEXT:  .LBB3_3: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movb %dil, (%r8)
+; X64-NEXT:    jl .LBB3_5
+; X64-NEXT:  # %bb.4: # %bb1
+; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    movl (%r9), %esi
+; X64-NEXT:    jmp .LBB3_5
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ]
+  %tmp2 = phi i16 [ %tmp12, %bb1 ], [ %arg2, %bb ]
+  %tmp3 = icmp sgt i16 %tmp2, 7
+  %tmp4 = select i1 %tmp3, i16 %tmp2, i16 7
+  %tmp5 = sext i8 %tmp to i64
+  %tmp6 = icmp slt i64 %arg3, %tmp5
+  %tmp7 = sext i1 %tmp6 to i32
+  %tmp8 = select i1 %tmp6, i8 %tmp, i8 %arg4
+  store volatile i8 %tmp8, i8* %ptr1
+  %tmp9 = load volatile i32, i32* %ptr2
+  %tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
+  %tmp11 = srem i32 0, %tmp10
+  %tmp12 = trunc i32 %tmp11 to i16
+  br label %bb1
+}



