[llvm] r265450 - [X86] Reuse EFLAGS and form LOCKed ops when only user is SETCC.

Tue Apr 5 13:02:57 PDT 2016

Author: ab
Date: Tue Apr  5 15:02:57 2016
New Revision: 265450

URL: http://llvm.org/viewvc/llvm-project?rev=265450&view=rev
Log:
[X86] Reuse EFLAGS and form LOCKed ops when only user is SETCC.

We only generate LOCKed versions of add/sub when the result is unused.
It often happens that the result is used, but only by a comparison. We
can optimize those out by reusing EFLAGS, which lets us use the proper
instructions, instead of having to fall back to LXADD.

Instead of doing this as an MI peephole (as we do for the other
non-LOCKed -- really, non-MR -- forms), do it in ISel, because it
becomes quite tricky to do later.

This also makes it eventually possible to stop expanding and/or/xor
if the only user is an icmp (also see D18141).

This uses the LOCK ISD opcodes added by r262244.

Differential Revision: http://reviews.llvm.org/D17633

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=265450&r1=265449&r2=265450&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Apr  5 15:02:57 2016
@@ -26134,6 +26134,56 @@ static SDValue combineSelect(SDNode *N,
   return SDValue();
 }
 
+/// Combine:
+///   (brcond/cmov/setcc .., (cmp (atomic_load_op ..), 0), cc)
+/// to:
+///   (brcond/cmov/setcc .., (LOCKed op ..), cc)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode CC,
+                                       SelectionDAG &DAG) {
+  // This combine only operates on CMP-like nodes.
+  if (!(Cmp.getOpcode() == X86ISD::CMP ||
+        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  if (!LHS.hasOneUse())
+    return SDValue();
+
+  // FIXME: We can do this for XOR/OR/AND as well, but only if they survive
+  // AtomicExpand. Currently, we choose to expand them to cmpxchg if they
+  // have any users. Could we relax that to ignore (icmp x,0) users?
+  switch (LHS->getOpcode()) {
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+    break;
+  default:
+    return SDValue();
+  }
+
+  auto *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue() != 0)
+    return SDValue();
+
+  // Don't do this for all condition codes, as OF/CF are cleared by (CMP x,0)
+  // but might be set by arithmetic. Furthermore, we might later select INC/DEC,
+  // which don't modify CF (though CCs using CF should have been optimized out).
+  // SF/ZF are safe as they are set the same way.
+  // Note that in theory, the transformation is also valid for P/NP.
+  if (CC != X86::COND_E && CC != X86::COND_NE && CC != X86::COND_S &&
+      CC != X86::COND_NS)
+    return SDValue();
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(LHS, DAG);
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(0),
+                                DAG.getUNDEF(LHS.getValueType()));
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LockOp.getValue(1));
+  return LockOp;
+}
+
 // Check whether a boolean test is testing a boolean value generated by
 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
 // code.
@@ -26305,6 +26355,16 @@ static bool checkBoolTestAndOrSetCCCombi
   return true;
 }
 
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG) {
+  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+    return R;
+  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
@@ -26331,15 +26391,14 @@ static SDValue combineCMov(SDNode *N, Se
     }
   }
 
-  SDValue Flags;
-
-  Flags = checkBoolTestSetCCCombine(Cond, CC);
-  if (Flags.getNode() &&
-      // Extra check as FCMOV only supports a subset of X86 cond.
-      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
-    SDValue Ops[] = { FalseOp, TrueOp,
-                      DAG.getConstant(CC, DL, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+  // Try to simplify the EFLAGS and condition code operands.
+  // We can't always do this as FCMOV only supports a subset of X86 cond.
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+        Flags};
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+    }
   }
 
   // If this is a select between two integer constants, try to do some
@@ -29265,7 +29324,8 @@ static SDValue combineX86SetCC(SDNode *N
   if (CC == X86::COND_B)
     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
 
-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
   }
@@ -29278,15 +29338,16 @@ static SDValue combineBrCond(SDNode *N,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
   SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  SDValue Dest = N->getOperand(1);
   SDValue EFLAGS = N->getOperand(3);
   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
 
-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+  // RAUW them under us.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
-    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
-                       Flags);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+                       N->getOperand(1), Cond, Flags);
   }
 
   return SDValue();

Modified: llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll?rev=265450&r1=265449&r2=265450&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll (original)
+++ llvm/trunk/test/CodeGen/X86/atomic-eflags-reuse.ll Tue Apr  5 15:02:57 2016
@@ -4,9 +4,7 @@
 define i8 @test_add_1_setcc_ne(i64* %p) #0 {
 ; CHECK-LABEL: test_add_1_setcc_ne:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock incq (%rdi)
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
 entry:
@@ -19,9 +17,7 @@ entry:
 define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_1_setcc_eq:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
 entry:
@@ -49,9 +45,7 @@ entry:
 define i8 @test_sub_10_setcc_sge(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_10_setcc_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-10, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $-10, (%rdi)
 ; CHECK-NEXT:    setns %al
 ; CHECK-NEXT:    retq
 entry:
@@ -66,9 +60,7 @@ entry:
 define i32 @test_add_10_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_add_10_brcond_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $10, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $10, (%rdi)
 ; CHECK-NEXT:    js .LBB4_2
 ; CHECK-NEXT:  # BB#1: # %t
 ; CHECK-NEXT:    movl %esi, %eax
@@ -89,9 +81,7 @@ f:
 define i32 @test_sub_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_sub_1_cmov_slt:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    cmovnsl %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq