[llvm] c9821ab - [WoA] Use fences for sequentially consistent stores/writes

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 23 16:09:58 PST 2023


Author: Nadeem, Usman
Date: 2023-01-23T16:09:11-08:00
New Revision: c9821abfc023fba684c8ef8589c49cba8083f579

URL: https://github.com/llvm/llvm-project/commit/c9821abfc023fba684c8ef8589c49cba8083f579
DIFF: https://github.com/llvm/llvm-project/commit/c9821abfc023fba684c8ef8589c49cba8083f579.diff

LOG: [WoA] Use fences for sequentially consistent stores/writes

LLVM currently uses LDAR/STLR and their variants for acquire/release
as well as seq_cst operations. This is fine as long as all code follows
this convention.

Normally LDAR/STLR act as one-way barriers, but when used in
combination they provide a sequentially consistent model: when an
LDAR appears after an STLR in program order, the STLR acts as a
two-way fence and the store is observed before the load.
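
As a quick illustration (not from the original commit message), here is
that pattern in C++; the lowerings noted in the comments assume LLVM's
current AArch64 convention described above, and the variable and
function names are made up:

    #include <atomic>

    std::atomic<int> x{0}, y{0};

    int store_then_load() {
      // Assumed lowering on AArch64: the seq_cst store becomes an STLR...
      x.store(1, std::memory_order_seq_cst);
      // ...and the seq_cst load becomes an LDAR. Because an LDAR that
      // follows an STLR in program order is observed after it, the pair
      // behaves like a two-way fence, which is what seq_cst needs here,
      // without any explicit DMB.
      return y.load(std::memory_order_seq_cst);
    }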

The problem is that a normal load (unlike an LDAR) that appears
after an STLR can be observed before the STLR (if my understanding
is correct), which can provide weaker-than-expected guarantees when
such loads are used for ordered atomic operations.

Unfortunately, the Microsoft Visual Studio STL implements seq_cst
loads/stores using normal loads/stores and explicit fences:
  store: dmb ish + str + dmb ish
  load:  ldr + dmb ish
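
To make the hazard concrete, here is a sketch (not from the original
commit message) of the classic store-buffering litmus test. The thread,
variable and function names are made up, and the lowerings in the
comments describe the hypothetical mixed scenario where thread0's store
comes from code using the bare STLR convention while its load comes from
MSVC-style code; building this one file with a single compiler will not
mix the conventions by itself.

    #include <atomic>
    #include <cassert>
    #include <thread>

    std::atomic<int> x{0}, y{0};
    int r0 = 0, r1 = 0;

    void thread0() {
      // Hypothetically from clang-built code: a bare STLR, no trailing fence.
      x.store(1, std::memory_order_seq_cst);
      // Hypothetically from MSVC-style code: a plain LDR followed by DMB ISH.
      // Nothing forces the plain LDR to be observed after the STLR above, so
      // this load may read 0 even though the store precedes it in program
      // order.
      r0 = y.load(std::memory_order_seq_cst);
    }

    void thread1() {
      // MSVC convention: DMB ISH + STR + DMB ISH, then LDR + DMB ISH; the
      // fences keep this thread's store and load ordered.
      y.store(1, std::memory_order_seq_cst);
      r1 = x.load(std::memory_order_seq_cst);
    }

    int main() {
      std::thread t0(thread0), t1(thread1);
      t0.join();
      t1.join();
      // Under a correct seq_cst implementation, r0 == 0 && r1 == 0 is
      // forbidden; in the mixed-convention scenario it becomes possible.
      assert(!(r0 == 0 && r1 == 0));
      return 0;
    }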

This patch emits fences for the MSVC target whenever we write to
memory in a sequentially consistent way, so that we do not rely on
the assumption that using LDAR/STLR alone will give us sequentially
consistent ordering.
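
With this change, a seq_cst write on an MSVC Windows/AArch64 target
keeps its release semantics and additionally gets a trailing fence, as
the updated test_atomic_store_seq_cst_i8 checks below show (stlrb
followed by dmb ish). A minimal C++ equivalent of that test, for
reference:

    #include <atomic>

    std::atomic<unsigned char> var8;

    void store_seq_cst(unsigned char val) {
      // On an MSVC Windows/AArch64 target, the expected lowering after this
      // patch is an stlrb followed by a dmb ish (see the test update below).
      var8.store(val, std::memory_order_seq_cst);
    }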

Differential Revision: https://reviews.llvm.org/D141748

Change-Id: I48f3500ff8ec89677c9f089ce58181bd139bc68a

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/AtomicExpandPass.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8317a2e146a08..639d48e342ef3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1983,6 +1983,13 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Whether AtomicExpandPass should automatically insert a trailing fence
+  /// without reducing the ordering for this atomic. Defaults to false.
+  virtual bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const {
+    return false;
+  }
+
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 64b72e9ac2300..c8bea66cb08b7 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -289,6 +289,24 @@ bool AtomicExpand::runOnFunction(Function &F) {
       if (FenceOrdering != AtomicOrdering::Monotonic) {
         MadeChange |= bracketInstWithFences(I, FenceOrdering);
       }
+    } else if (I->hasAtomicStore() &&
+               TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+      auto FenceOrdering = AtomicOrdering::Monotonic;
+      if (SI)
+        FenceOrdering = SI->getOrdering();
+      else if (RMWI)
+        FenceOrdering = RMWI->getOrdering();
+      else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+                           TargetLoweringBase::AtomicExpansionKind::LLSC)
+        // LLSC is handled in expandAtomicCmpXchg().
+        FenceOrdering = CASI->getSuccessOrdering();
+
+      IRBuilder Builder(I);
+      if (auto TrailingFence =
+              TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+        TrailingFence->moveAfter(I);
+        MadeChange = true;
+      }
     }
 
     if (LI)
@@ -1356,7 +1374,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   // Make sure later instructions don't get reordered with a fence if
   // necessary.
   Builder.SetInsertPoint(SuccessBB);
-  if (ShouldInsertFencesForAtomic)
+  if (ShouldInsertFencesForAtomic ||
+      TLI->shouldInsertTrailingFenceForAtomicStore(CI))
     TLI->emitTrailingFence(Builder, CI, SuccessOrder);
   Builder.CreateBr(ExitBB);
 

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 95c9db7c35906..abe4ef7447142 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22300,6 +22300,30 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic(
   return isOpSuitableForLDPSTP(I);
 }
 
+bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
+    const Instruction *I) const {
+  // Store-Release instructions only provide seq_cst guarantees when paired with
+  // Load-Acquire instructions. MSVC CRT does not use these instructions to
+  // implement seq_cst loads and stores, so we need additional explicit fences
+  // after memory writes.
+  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+    return false;
+
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::AtomicCmpXchg:
+    return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::AtomicRMW:
+    return cast<AtomicRMWInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  case Instruction::Store:
+    return cast<StoreInst>(I)->getOrdering() ==
+           AtomicOrdering::SequentiallyConsistent;
+  }
+}
+
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 99965ce1902e1..0edec721ed879 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -706,6 +706,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool isOpSuitableForLDPSTP(const Instruction *I) const;
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+  bool
+  shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override;
 
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;

diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
index 73dedfea6be48..762e70a6e78c1 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
@@ -19,6 +19,7 @@ define dso_local i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw add ptr @var8, i8 %offset seq_cst
    ret i8 %old
@@ -135,16 +136,17 @@ define dso_local i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
 define dso_local i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_sub_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB7_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    sub x10, x8, x0
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    sub x10, x0, x8
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB7_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw sub ptr @var64, i64 %offset seq_cst
    ret i64 %old
@@ -199,6 +201,7 @@ define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB10_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw and ptr @var32, i32 %offset seq_cst
    ret i32 %old
@@ -235,6 +238,7 @@ define dso_local i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB12_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw or ptr @var8, i8 %offset seq_cst
    ret i8 %old
@@ -343,6 +347,7 @@ define dso_local i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB18_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw xor ptr @var32, i32 %offset seq_cst
    ret i32 %old
@@ -397,6 +402,7 @@ define dso_local i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w10, .LBB21_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw xchg ptr @var16, i16 %offset seq_cst
    ret i16 %old
@@ -500,17 +506,18 @@ define dso_local i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
 define dso_local i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_min_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    adrp x9, var64
 ; CHECK-NEXT:    add x9, x9, :lo12:var64
 ; CHECK-NEXT:  .LBB27_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x8, [x9]
-; CHECK-NEXT:    cmp x8, x0
-; CHECK-NEXT:    csel x10, x8, x0, le
+; CHECK-NEXT:    ldaxr x0, [x9]
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x10, x0, x8, le
 ; CHECK-NEXT:    stlxr w11, x10, [x9]
 ; CHECK-NEXT:    cbnz w11, .LBB27_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw min ptr @var64, i64 %offset seq_cst
    ret i64 %old
@@ -531,6 +538,7 @@ define dso_local i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB28_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw max ptr @var8, i8 %offset seq_cst
    ret i8 %old
@@ -648,6 +656,7 @@ define dso_local i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB34_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw umin ptr @var32, i32 %offset seq_cst
    ret i32 %old
@@ -726,6 +735,7 @@ define dso_local i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
 ; CHECK-NEXT:    cbnz w11, .LBB38_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
    %old = atomicrmw umax ptr @var32, i32 %offset seq_cst
    ret i32 %old
@@ -794,7 +804,8 @@ define dso_local i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; CHECK-NEXT:    // in Loop: Header=BB41_1 Depth=1
 ; CHECK-NEXT:    stlxrh w10, w1, [x9]
 ; CHECK-NEXT:    cbnz w10, .LBB41_1
-; CHECK-NEXT:  // %bb.3: // %cmpxchg.end
+; CHECK-NEXT:  // %bb.3: // %cmpxchg.success
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB41_4: // %cmpxchg.nostore
@@ -972,6 +983,7 @@ define dso_local void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
 ; CHECK-NEXT:    adrp x8, var8
 ; CHECK-NEXT:    add x8, x8, :lo12:var8
 ; CHECK-NEXT:    stlrb w0, [x8]
+; CHECK-NEXT:    dmb ish
 ; CHECK-NEXT:    ret
   store atomic i8 %val, ptr @var8 seq_cst, align 1
   ret void


        

