[llvm] r360393 - [X86] Improve lowering of idempotent RMW operations

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Thu May 9 16:23:42 PDT 2019


Author: reames
Date: Thu May  9 16:23:42 2019
New Revision: 360393

URL: http://llvm.org/viewvc/llvm-project?rev=360393&view=rev
Log:
[X86] Improve lowering of idempotent RMW operations

The current lowering uses an mfence. mfences have substantially higher latency than the locked operation originally requested, but we do want to avoid contention on the original cache line. As such, use a locked instruction on a cache line assumed to be thread local.
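For example, for a seq_cst idempotent RMW whose result is unused, such as

  atomicrmw or i32* %p, i32 0 seq_cst

the x86-64 codegen changes (taken from the updated test below) from

  mfence
  movl (%rdi), %eax
  retq

to a locked operation against a (presumed thread-local) stack slot:

  lock orl $0, (%rsp)
  retq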

Differential Revision: https://reviews.llvm.org/D58632


Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=360393&r1=360392&r2=360393&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu May  9 16:23:42 2019
@@ -25727,6 +25727,14 @@ X86TargetLowering::lowerIdempotentRMWInt
   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
     return nullptr;
 
+  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
+  // lowering available in lowerAtomicArith.
+  // TODO: push more cases through this path. 
+  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
+    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
+        AI->use_empty())
+      return nullptr;
+
   auto Builder = IRBuilder<>(AI);
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto SSID = AI->getSyncScopeID();
@@ -26223,6 +26231,59 @@ static SDValue LowerBITREVERSE(SDValue O
   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
 }
 
+/// Emit a locked operation on a stack location which does not change any
+/// memory location, but does involve a lock prefix.  Location is chosen to be
+/// a) very likely accessed only by a single thread to minimize cache traffic,
+/// and b) definitely dereferenceable.  Returns the new Chain result.  
+static SDValue emitLockedStackOp(SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget,
+                                 SDValue Chain, SDLoc DL) {
+  // Implementation notes:
+  // 1) LOCK prefix creates a full read/write reordering barrier for memory
+  // operations issued by the current processor.  As such, the location
+  // referenced is not relevant for the ordering properties of the instruction.
+  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
+  // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions 
+  // 2) Using an immediate operand appears to be the best encoding choice
+  // here since it doesn't require an extra register.
+  // 3) OR appears to be very slightly faster than ADD. (Though, the difference
+  // is small enough it might just be measurement noise.)
+  // 4) For the moment, we are using top of stack.  This creates false sharing
+  // with actual stack access/call sequences, and it would be better to use a
+  // location within the redzone.  For the moment, this is still better than an
+  // mfence though.  TODO: Revise the offset used when we can assume a redzone.
+  // 
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  if (Subtarget.is64Bit()) {
+    SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::RSP, MVT::i64),                  // Base
+      DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
+      DAG.getRegister(0, MVT::i64),                         // Index
+      DAG.getTargetConstant(0, DL, MVT::i32),               // Disp
+      DAG.getRegister(0, MVT::i32),                         // Segment.
+      Zero,
+      Chain};
+    SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+  SDValue Ops[] = {
+    DAG.getRegister(X86::ESP, MVT::i32),            // Base
+    DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
+    DAG.getRegister(0, MVT::i32),                   // Index
+    DAG.getTargetConstant(0, DL, MVT::i32),         // Disp
+    DAG.getRegister(0, MVT::i32),                   // Segment.
+    Zero,
+    Chain
+  };
+  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
   unsigned NewOpc = 0;
@@ -26257,6 +26318,7 @@ static SDValue lowerAtomicArithWithLOCK(
 /// Lower atomic_load_ops into LOCK-prefixed operations.
 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
+  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
   SDValue Chain = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
@@ -26271,7 +26333,6 @@ static SDValue lowerAtomicArith(SDValue
     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
     // select LXADD if LOCK_SUB can't be selected.
     if (Opc == ISD::ATOMIC_LOAD_SUB) {
-      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                            RHS, AN->getMemOperand());
@@ -26281,6 +26342,32 @@ static SDValue lowerAtomicArith(SDValue
     return N;
   }
 
+  // Specialized lowering for the canonical form of an idempotent atomicrmw.
+  // The core idea here is that since the memory location isn't actually
+  // changing, all we need is a lowering for the *ordering* impacts of the
+  // atomicrmw.  As such, we can choose a different operation and memory
+  // location to minimize impact on other code.
+  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
+    // On X86, the only ordering which actually requires an instruction is
+    // seq_cst at a scope other than SingleThread; every other case just needs
+    // to be preserved during codegen and then dropped. Note that we expect
+    // (but don't assume) that orderings other than seq_cst and acq_rel have
+    // been canonicalized to a store or load.
+    if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+        AN->getSyncScopeID() == SyncScope::System) {
+      // Prefer a locked operation against a stack location to minimize cache
+      // traffic.  This assumes that stack locations are very likely to be
+      // accessed only by the owning thread. 
+      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
+      DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+      return SDValue();
+    }
+    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+    SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
+    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
+    return SDValue();
+  }
+
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
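In IR terms, the new path in lowerAtomicArith distinguishes two cases for a
canonical idempotent RMW whose result is unused (a sketch; both forms are
exercised by the updated test below):

  ; seq_cst at system scope -> locked or of 0 against a stack slot
  atomicrmw or i32* %p, i32 0 seq_cst

  ; weaker orderings (monotonic/acquire/release/acq_rel) -> X86ISD::MEMBARRIER,
  ; a compiler-only barrier which emits no instruction
  atomicrmw or i32* %p, i32 0 acq_rel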

Modified: llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll?rev=360393&r1=360392&r2=360393&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll (original)
+++ llvm/trunk/test/CodeGen/X86/atomic-idempotent.ll Thu May  9 16:23:42 2019
@@ -166,70 +166,38 @@ define i32 @and32 (i32* %p) {
 }
 
 define void @or32_nouse_monotonic(i32* %p) {
-; X64-LABEL: or32_nouse_monotonic:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_monotonic:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }
 
 
 define void @or32_nouse_acquire(i32* %p) {
-; X64-LABEL: or32_nouse_acquire:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acquire:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }
 
 define void @or32_nouse_release(i32* %p) {
-; X64-LABEL: or32_nouse_release:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_release:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 release
   ret void
 }
 
 define void @or32_nouse_acq_rel(i32* %p) {
-; X64-LABEL: or32_nouse_acq_rel:
-; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: or32_nouse_acq_rel:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
-; X32-NEXT:    retl
+; CHECK-LABEL: or32_nouse_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    #MEMBARRIER
+; CHECK-NEXT:    ret{{[l|q]}}
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }
@@ -237,15 +205,12 @@ define void @or32_nouse_acq_rel(i32* %p)
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or32_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
@@ -255,8 +220,7 @@ define void @or32_nouse_seq_cst(i32* %p)
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or64_nouse_seq_cst:
@@ -334,15 +298,12 @@ define void @or128_nouse_seq_cst(i128* %
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or16_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movzwl (%eax), %eax
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
@@ -351,15 +312,12 @@ define void @or16_nouse_seq_cst(i16* %p)
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
-; X64-NEXT:    movb (%rdi), %al
+; X64-NEXT:    lock orl $0, (%rsp)
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: or8_nouse_seq_cst:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    mfence
-; X32-NEXT:    movb (%eax), %al
+; X32-NEXT:    lock orl $0, (%esp)
 ; X32-NEXT:    retl
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void



