[llvm-branch-commits] [llvm] release/19.x: [AArch64] Don't replace dst of SWP instructions with (X|W)ZR (#102139) (PR #102316)
Tobias Hieta via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Aug 10 03:07:35 PDT 2024
https://github.com/tru updated https://github.com/llvm/llvm-project/pull/102316
From c25c15edb1cc58796ede2071d4fdbdd1575bcd66 Mon Sep 17 00:00:00 2001
From: Lucas Duarte Prates <lucas.prates at arm.com>
Date: Wed, 7 Aug 2024 15:15:25 +0100
Subject: [PATCH] [AArch64] Don't replace dst of SWP instructions with (X|W)ZR
(#102139)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This change updates the AArch64DeadRegisterDefinitions pass so that it
does not replace the destination register of a SWP instruction with the
zero register when its value is unused. This is necessary to ensure that
the ordering of such instructions relative to DMB.LD barriers adheres
to the definitions of the AArch64 memory model.
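For context, the pass rewrites definitions whose results are never read
so that they target the zero register instead. A sketch of the kind of
rewrite it performs (the instruction and register choices here are
illustrative, not taken from the patch):
```
subs w8,  w1, w2   ; before: result written to w8, but only the flags are used
subs wzr, w1, w2   ; after: dead definition retargeted, equivalent to cmp w1, w2
```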
The memory model states the following (ARMARM version DDI 0487K.a
§B2.3.7):
```
Barrier-ordered-before
An effect E1 is Barrier-ordered-before an effect E2 if one of the following applies:
[...]
* All of the following apply:
- E1 is a Memory Read effect.
- E1 is generated by an instruction whose destination register is not WZR or XZR.
- E1 appears in program order before E3.
- E3 is either a DMB LD effect or a DSB LD effect.
- E3 appears in program order before E2.
```
Prior to this change, replacing the destination register of such SWP
instructions with WZR/XZR incorrectly removed the ordering relation
described above from the generated code.
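As an illustration (assuming the exchanged value is dead and `w8` is an
arbitrary register chosen by the allocator), the generated code changed
along these lines:
```
; before the fix: the dead destination was rewritten to wzr, so the
; SWP's read effect was no longer Barrier-ordered-before the load
swp w2, wzr, [x0]
dmb ishld
ldr w0, [x1]       ; could be satisfied before the exchange

; after the fix: the dead destination stays in a real register, so the
; read effect participates in the DMB LD ordering again
swp w2, w8, [x0]
dmb ishld
ldr w0, [x1]
```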
The new behaviour is ensured in this patch by adding the relevant
`SWP[L](B|H|W|X)` instructions to the list in the `atomicReadDroppedOnZero`
predicate, which already covered the `LD<Op>` instructions that are
subject to the same effect.
Fixes #68428.
(cherry picked from commit beb37e2e22b549b361be7269a52a3715649e956a)
---
.../AArch64DeadRegisterDefinitionsPass.cpp | 4 ++
.../Atomics/aarch64-atomic-exchange-fence.ll | 64 +++++++++++++++++++
2 files changed, 68 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll
diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 2bc14f9821e639..161cf24dd4037f 100644
--- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -108,6 +108,10 @@ static bool atomicReadDroppedOnZero(unsigned Opcode) {
case AArch64::LDUMINW: case AArch64::LDUMINX:
case AArch64::LDUMINLB: case AArch64::LDUMINLH:
case AArch64::LDUMINLW: case AArch64::LDUMINLX:
+ case AArch64::SWPB: case AArch64::SWPH:
+ case AArch64::SWPW: case AArch64::SWPX:
+ case AArch64::SWPLB: case AArch64::SWPLH:
+ case AArch64::SWPLW: case AArch64::SWPLX:
return true;
}
return false;
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll
new file mode 100644
index 00000000000000..2adbc709d238da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll
@@ -0,0 +1,64 @@
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse -O0 | FileCheck %s
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse -O1 | FileCheck %s
+
+; When its destination register is WZR/XZR, a SWP operation is not regarded as
+; a read for the purpose of a DMB.LD in the AArch64 memory model.
+; This test ensures that the AArch64DeadRegisterDefinitions pass does not
+; replace the destination register of SWP instructions with the zero register
+; when the read value is unused.
+
+define dso_local i32 @atomic_exchange_monotonic(ptr %ptr, ptr %ptr2, i32 %value) {
+; CHECK-LABEL: atomic_exchange_monotonic:
+; CHECK: // %bb.0:
+; CHECK-NEXT: swp
+; CHECK-NOT: wzr
+; CHECK-NEXT: dmb ishld
+; CHECK-NEXT: ldr w0, [x1]
+; CHECK-NEXT: ret
+ %r0 = atomicrmw xchg ptr %ptr, i32 %value monotonic
+ fence acquire
+ %r1 = load atomic i32, ptr %ptr2 monotonic, align 4
+ ret i32 %r1
+}
+
+define dso_local i32 @atomic_exchange_acquire(ptr %ptr, ptr %ptr2, i32 %value) {
+; CHECK-LABEL: atomic_exchange_acquire:
+; CHECK: // %bb.0:
+; CHECK-NEXT: swpa
+; CHECK-NOT: wzr
+; CHECK-NEXT: dmb ishld
+; CHECK-NEXT: ldr w0, [x1]
+; CHECK-NEXT: ret
+ %r0 = atomicrmw xchg ptr %ptr, i32 %value acquire
+ fence acquire
+ %r1 = load atomic i32, ptr %ptr2 monotonic, align 4
+ ret i32 %r1
+}
+
+define dso_local i32 @atomic_exchange_release(ptr %ptr, ptr %ptr2, i32 %value) {
+; CHECK-LABEL: atomic_exchange_release:
+; CHECK: // %bb.0:
+; CHECK-NEXT: swpl
+; CHECK-NOT: wzr
+; CHECK-NEXT: dmb ishld
+; CHECK-NEXT: ldr w0, [x1]
+; CHECK-NEXT: ret
+ %r0 = atomicrmw xchg ptr %ptr, i32 %value release
+ fence acquire
+ %r1 = load atomic i32, ptr %ptr2 monotonic, align 4
+ ret i32 %r1
+}
+
+define dso_local i32 @atomic_exchange_acquire_release(ptr %ptr, ptr %ptr2, i32 %value) {
+; CHECK-LABEL: atomic_exchange_acquire_release:
+; CHECK: // %bb.0:
+; CHECK-NEXT: swpal
+; CHECK-NOT: wzr
+; CHECK-NEXT: dmb ishld
+; CHECK-NEXT: ldr w0, [x1]
+; CHECK-NEXT: ret
+ %r0 = atomicrmw xchg ptr %ptr, i32 %value acq_rel
+ fence acquire
+ %r1 = load atomic i32, ptr %ptr2 monotonic, align 4
+ ret i32 %r1
+}