[llvm] r311806 - [x86] Teach the backend to fold more read-modify-write memory operands

Chandler Carruth via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 25 15:50:52 PDT 2017


Author: chandlerc
Date: Fri Aug 25 15:50:52 2017
New Revision: 311806

URL: http://llvm.org/viewvc/llvm-project?rev=311806&view=rev
Log:
[x86] Teach the backend to fold more read-modify-write memory operands
to instructions.

These can't be reasonably matched in tablegen due to the handling of
flags, so we have to do this in C++ code. We only did it for `inc` and
`dec` historically, this starts fleshing that out to more interesting
instructions. Notably, this handles transferring operands to `add` and
`sub`.

Currently this forces them into a register. The next patch will add
support for keeping immediate operands as immediates. Then I'll extend
this beyond just `add` and `sub`.

I'm not super thrilled by the repeated switches in the code but
everything else I tried was really ugly or problematic.

Many thanks to Craig Topper for the suggestions about where to even
begin here and how to make this stuff work.

Differential Revision: https://reviews.llvm.org/D37130

Added:
    llvm/trunk/test/CodeGen/X86/fold-rmw-ops.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/trunk/test/CodeGen/X86/add.ll
    llvm/trunk/test/CodeGen/X86/addcarry.ll
    llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
    llvm/trunk/test/CodeGen/X86/pr32659.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=311806&r1=311805&r2=311806&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Fri Aug 25 15:50:52 2017
@@ -1932,42 +1932,6 @@ static bool hasNoSignedComparisonUses(SD
   return true;
 }
 
-/// Get the appropriate X86 opcode for an in-memory arithmetic operation that
-/// also sets flags.
-///
-/// FIXME: This is essentially re-implemneting a subset of the patterns for
-/// these instructions. Instead, we should compute this from the patterns
-/// somehow.
-///
-/// FIXME: Currently we only support integer operations.
-///
-/// If there is no X86 opcode, returns none.
-static Optional<unsigned> getFusedLdStWithFlagsOpcode(EVT LdVT, unsigned Opc) {
-  auto SelectSize = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
-                        unsigned Opc8) -> Optional<unsigned> {
-    switch (LdVT.getSimpleVT().SimpleTy) {
-    case MVT::i64:
-      return Opc64;
-    case MVT::i32:
-      return Opc32;
-    case MVT::i16:
-      return Opc16;
-    case MVT::i8:
-      return Opc8;
-    default:
-      return None;
-    }
-  };
-  switch (Opc) {
-  default:
-    return None;
-  case X86ISD::DEC:
-    return SelectSize(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
-  case X86ISD::INC:
-    return SelectSize(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m);
-  }
-}
-
 /// Check whether or not the chain ending in StoreNode is suitable for doing
 /// the {load; op; store} to modify transformation.
 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
@@ -2047,15 +2011,16 @@ static bool isFusableLoadOpStorePattern(
   return true;
 }
 
-// Change a chain of {load; incr or dec; store} of the same value into
-// a simple increment or decrement through memory of that value, if the
-// uses of the modified value and its address are suitable.
-// The DEC64m tablegen pattern is currently not able to match the case where
-// the EFLAGS on the original DEC are used. (This also applies to
-// {INC,DEC}X{64,32,16,8}.)
-// We'll need to improve tablegen to allow flags to be transferred from a
-// node in the pattern to the result node.  probably with a new keyword
-// for example, we have this
+// Change a chain of {load; op; store} of the same value into a simple op
+// through memory of that value, if the uses of the modified value and its
+// address are suitable.
+//
+// The tablegen pattern memory operand pattern is currently not able to match
+// the case where the EFLAGS on the original operation are used.
+//
+// To move this to tablegen, we'll need to improve tablegen to allow flags to
+// be transferred from a node in the pattern to the result node, probably with
+// a new keyword. For example, we have this
 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
 //   (implicit EFLAGS)]>;
@@ -2064,19 +2029,29 @@ static bool isFusableLoadOpStorePattern(
 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
 //   (transferrable EFLAGS)]>;
 //
-// FIXME: This should handle a wide range of operations which support RMW
-// memory operands, not just inc and dec.
+// Until then, we manually fold these and instruction select the operation
+// here.
 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
   SDValue StoredVal = StoreNode->getOperand(1);
   unsigned Opc = StoredVal->getOpcode();
 
+  // Before we try to select anything, make sure this is memory operand size
+  // and opcode we can handle. Note that this must match the code below that
+  // actually lowers the opcodes.
   EVT MemVT = StoreNode->getMemoryVT();
-  if (!MemVT.isSimple())
+  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
+      MemVT != MVT::i8)
     return false;
-  Optional<unsigned> NewOpc = getFusedLdStWithFlagsOpcode(MemVT, Opc);
-  if (!NewOpc)
+  switch (Opc) {
+  default:
     return false;
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+    break;
+  }
 
   LoadSDNode *LoadNode = nullptr;
   SDValue InputChain;
@@ -2089,12 +2064,57 @@ bool X86DAGToDAGISel::foldLoadStoreIntoM
                   Segment))
     return false;
 
+  auto SelectOpcodeForSize = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
+                                 unsigned Opc8) {
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    case MVT::i64:
+      return Opc64;
+    case MVT::i32:
+      return Opc32;
+    case MVT::i16:
+      return Opc16;
+    case MVT::i8:
+      return Opc8;
+    default:
+      llvm_unreachable("Invalid size!");
+    }
+  };
+
+  MachineSDNode *Result;
+  switch (Opc) {
+  case X86ISD::INC:
+  case X86ISD::DEC: {
+    unsigned NewOpc = Opc == X86ISD::INC
+                          ? SelectOpcodeForSize(X86::INC64m, X86::INC32m,
+                                                X86::INC16m, X86::INC8m)
+                          : SelectOpcodeForSize(X86::DEC64m, X86::DEC32m,
+                                                X86::DEC16m, X86::DEC8m);
+    const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+    Result =
+        CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+    break;
+  }
+  case X86ISD::ADD:
+  case X86ISD::SUB: {
+    unsigned NewOpc = Opc == X86ISD::ADD
+                          ? SelectOpcodeForSize(X86::ADD64mr, X86::ADD32mr,
+                                                X86::ADD16mr, X86::ADD8mr)
+                          : SelectOpcodeForSize(X86::SUB64mr, X86::SUB32mr,
+                                                X86::SUB16mr, X86::SUB8mr);
+    const SDValue Ops[] = {Base,      Scale,   Index,
+                           Disp,      Segment, StoredVal->getOperand(1),
+                           InputChain};
+    Result =
+        CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+    break;
+  }
+  default:
+    llvm_unreachable("Invalid opcode!");
+  }
+
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
   MemOp[0] = StoreNode->getMemOperand();
   MemOp[1] = LoadNode->getMemOperand();
-  const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
-  MachineSDNode *Result =
-      CurDAG->getMachineNode(*NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
   Result->setMemRefs(MemOp, MemOp + 2);
 
   ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));

Modified: llvm/trunk/test/CodeGen/X86/add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/add.ll?rev=311806&r1=311805&r2=311806&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/add.ll Fri Aug 25 15:50:52 2017
@@ -341,9 +341,8 @@ define void @test12(i64* inreg %a) nounw
 ; X32-LABEL: test12:
 ; X32:       # BB#0: # %entry
 ; X32-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X32-NEXT:    addl (%eax), %ecx
+; X32-NEXT:    addl %ecx, (%eax)
 ; X32-NEXT:    adcl $0, 4(%eax)
-; X32-NEXT:    movl %ecx, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LINUX-LABEL: test12:
@@ -366,9 +365,8 @@ define void @test13(i64* inreg %a) nounw
 ; X32-LABEL: test13:
 ; X32:       # BB#0: # %entry
 ; X32-NEXT:    movl $128, %ecx
-; X32-NEXT:    addl (%eax), %ecx
+; X32-NEXT:    addl %ecx, (%eax)
 ; X32-NEXT:    adcl $0, 4(%eax)
-; X32-NEXT:    movl %ecx, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LINUX-LABEL: test13:

Modified: llvm/trunk/test/CodeGen/X86/addcarry.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/addcarry.ll?rev=311806&r1=311805&r2=311806&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/addcarry.ll (original)
+++ llvm/trunk/test/CodeGen/X86/addcarry.ll Fri Aug 25 15:50:52 2017
@@ -171,8 +171,7 @@ define void @muladd(%accumulator* nocapt
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    mulq %rsi
-; CHECK-NEXT:    addq (%rdi), %rax
-; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    addq %rax, (%rdi)
 ; CHECK-NEXT:    adcq 8(%rdi), %rdx
 ; CHECK-NEXT:    movq %rdx, 8(%rdi)
 ; CHECK-NEXT:    adcl $0, 16(%rdi)

Added: llvm/trunk/test/CodeGen/X86/fold-rmw-ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fold-rmw-ops.ll?rev=311806&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fold-rmw-ops.ll (added)
+++ llvm/trunk/test/CodeGen/X86/fold-rmw-ops.ll Fri Aug 25 15:50:52 2017
@@ -0,0 +1,420 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+
+target triple = "x86_64-unknown-unknown"
+
+ at g64 = external global i64, align 8
+ at g32 = external global i32, align 4
+ at g16 = external global i16, align 2
+ at g8 = external global i8, align 1
+
+declare void @a()
+declare void @b()
+
+define void @add64_imm_br() nounwind {
+; CHECK-LABEL: add64_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    addq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB0_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB0_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i64, i64* @g64
+  %add = add nsw i64 %load1, 42
+  store i64 %add, i64* @g64
+  %cond = icmp slt i64 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add32_imm_br() nounwind {
+; CHECK-LABEL: add32_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    addl %eax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB1_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB1_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i32, i32* @g32
+  %add = add nsw i32 %load1, 42
+  store i32 %add, i32* @g32
+  %cond = icmp slt i32 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add16_imm_br() nounwind {
+; CHECK-LABEL: add16_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movw $42, %ax
+; CHECK-NEXT:    addw %ax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB2_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB2_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i16, i16* @g16
+  %add = add nsw i16 %load1, 42
+  store i16 %add, i16* @g16
+  %cond = icmp slt i16 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add8_imm_br() nounwind {
+; CHECK-LABEL: add8_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movb $42, %al
+; CHECK-NEXT:    addb %al, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB3_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB3_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i8, i8* @g8
+  %add = add nsw i8 %load1, 42
+  store i8 %add, i8* @g8
+  %cond = icmp slt i8 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: add64_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addq %rdi, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB4_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB4_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i64, i64* @g64
+  %add = add nsw i64 %load1, %arg
+  store i64 %add, i64* @g64
+  %cond = icmp slt i64 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: add32_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addl %edi, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB5_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB5_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i32, i32* @g32
+  %add = add nsw i32 %load1, %arg
+  store i32 %add, i32* @g32
+  %cond = icmp slt i32 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: add16_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addw %di, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB6_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB6_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i16, i16* @g16
+  %add = add nsw i16 %load1, %arg
+  store i16 %add, i16* @g16
+  %cond = icmp slt i16 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @add8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: add8_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addb %dil, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB7_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB7_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i8, i8* @g8
+  %add = add nsw i8 %load1, %arg
+  store i8 %add, i8* @g8
+  %cond = icmp slt i8 %add, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub64_imm_br() nounwind {
+; CHECK-LABEL: sub64_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq $-42, %rax
+; CHECK-NEXT:    addq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB8_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB8_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i64, i64* @g64
+  %sub = sub nsw i64 %load1, 42
+  store i64 %sub, i64* @g64
+  %cond = icmp slt i64 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub32_imm_br() nounwind {
+; CHECK-LABEL: sub32_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl $-42, %eax
+; CHECK-NEXT:    addl %eax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB9_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB9_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i32, i32* @g32
+  %sub = sub nsw i32 %load1, 42
+  store i32 %sub, i32* @g32
+  %cond = icmp slt i32 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub16_imm_br() nounwind {
+; CHECK-LABEL: sub16_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movw $-42, %ax
+; CHECK-NEXT:    addw %ax, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB10_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB10_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i16, i16* @g16
+  %sub = sub nsw i16 %load1, 42
+  store i16 %sub, i16* @g16
+  %cond = icmp slt i16 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub8_imm_br() nounwind {
+; CHECK-LABEL: sub8_imm_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movb $-42, %al
+; CHECK-NEXT:    addb %al, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB11_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB11_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i8, i8* @g8
+  %sub = sub nsw i8 %load1, 42
+  store i8 %sub, i8* @g8
+  %cond = icmp slt i8 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: sub64_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subq %rdi, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB12_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB12_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i64, i64* @g64
+  %sub = sub nsw i64 %load1, %arg
+  store i64 %sub, i64* @g64
+  %cond = icmp slt i64 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: sub32_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subl %edi, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB13_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB13_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i32, i32* @g32
+  %sub = sub nsw i32 %load1, %arg
+  store i32 %sub, i32* @g32
+  %cond = icmp slt i32 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: sub16_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subw %di, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB14_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB14_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i16, i16* @g16
+  %sub = sub nsw i16 %load1, %arg
+  store i16 %sub, i16* @g16
+  %cond = icmp slt i16 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}
+
+define void @sub8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: sub8_reg_br:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subb %dil, {{.*}}(%rip)
+; CHECK-NEXT:    js .LBB15_1
+; CHECK-NEXT:  # BB#2: # %b
+; CHECK-NEXT:    jmp b # TAILCALL
+; CHECK-NEXT:  .LBB15_1: # %a
+; CHECK-NEXT:    jmp a # TAILCALL
+entry:
+  %load1 = load i8, i8* @g8
+  %sub = sub nsw i8 %load1, %arg
+  store i8 %sub, i8* @g8
+  %cond = icmp slt i8 %sub, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  tail call void @a()
+  ret void
+
+b:
+  tail call void @b()
+  ret void
+}

Modified: llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll?rev=311806&r1=311805&r2=311806&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll (original)
+++ llvm/trunk/test/CodeGen/X86/peephole-na-phys-copy-folding.ll Fri Aug 25 15:50:52 2017
@@ -65,10 +65,9 @@ exit2:
 define i1 @plus_forty_two() nounwind {
 ; CHECK32-LABEL: plus_forty_two:
 ; CHECK32:       # BB#0: # %entry
-; CHECK32-NEXT:    movl L, %ecx
 ; CHECK32-NEXT:    movb M, %al
-; CHECK32-NEXT:    addl $42, %ecx
-; CHECK32-NEXT:    movl %ecx, L
+; CHECK32-NEXT:    movl $42, %ecx
+; CHECK32-NEXT:    addl %ecx, L
 ; CHECK32-NEXT:    jne .LBB1_2
 ; CHECK32-NEXT:  # BB#1: # %entry
 ; CHECK32-NEXT:    andb $8, %al
@@ -82,10 +81,9 @@ define i1 @plus_forty_two() nounwind {
 ;
 ; CHECK64-LABEL: plus_forty_two:
 ; CHECK64:       # BB#0: # %entry
-; CHECK64-NEXT:    movl {{.*}}(%rip), %ecx
 ; CHECK64-NEXT:    movb {{.*}}(%rip), %al
-; CHECK64-NEXT:    addl $42, %ecx
-; CHECK64-NEXT:    movl %ecx, {{.*}}(%rip)
+; CHECK64-NEXT:    movl $42, %ecx
+; CHECK64-NEXT:    addl %ecx, {{.*}}(%rip)
 ; CHECK64-NEXT:    jne .LBB1_2
 ; CHECK64-NEXT:  # BB#1: # %entry
 ; CHECK64-NEXT:    andb $8, %al
@@ -165,10 +163,9 @@ exit2:
 define i1 @minus_forty_two() nounwind {
 ; CHECK32-LABEL: minus_forty_two:
 ; CHECK32:       # BB#0: # %entry
-; CHECK32-NEXT:    movl L, %ecx
 ; CHECK32-NEXT:    movb M, %al
-; CHECK32-NEXT:    addl $-42, %ecx
-; CHECK32-NEXT:    movl %ecx, L
+; CHECK32-NEXT:    movl $-42, %ecx
+; CHECK32-NEXT:    addl %ecx, L
 ; CHECK32-NEXT:    jne .LBB3_2
 ; CHECK32-NEXT:  # BB#1: # %entry
 ; CHECK32-NEXT:    andb $8, %al
@@ -182,10 +179,9 @@ define i1 @minus_forty_two() nounwind {
 ;
 ; CHECK64-LABEL: minus_forty_two:
 ; CHECK64:       # BB#0: # %entry
-; CHECK64-NEXT:    movl {{.*}}(%rip), %ecx
 ; CHECK64-NEXT:    movb {{.*}}(%rip), %al
-; CHECK64-NEXT:    addl $-42, %ecx
-; CHECK64-NEXT:    movl %ecx, {{.*}}(%rip)
+; CHECK64-NEXT:    movl $-42, %ecx
+; CHECK64-NEXT:    addl %ecx, {{.*}}(%rip)
 ; CHECK64-NEXT:    jne .LBB3_2
 ; CHECK64-NEXT:  # BB#1: # %entry
 ; CHECK64-NEXT:    andb $8, %al

Modified: llvm/trunk/test/CodeGen/X86/pr32659.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr32659.ll?rev=311806&r1=311805&r2=311806&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr32659.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr32659.ll Fri Aug 25 15:50:52 2017
@@ -50,10 +50,10 @@ define void @fn2() nounwind optsize {
 ; CHECK-NEXT:    sarl $31, %eax
 ; CHECK-NEXT:    andl %eax, e+4
 ; CHECK-NEXT:    decl g
-; CHECK-NEXT:    movl f, %eax
-; CHECK-NEXT:    addl $1, %eax
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    addl %eax, f
 ; CHECK-NEXT:    adcl $0, f+4
-; CHECK-NEXT:    movl %eax, f
 ; CHECK-NEXT:    addl $8, %esp
 ; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    retl




More information about the llvm-commits mailing list