[clang] [clang][ARM] Fix build failure in <arm_acle.h> for __swp (PR #151354)

Fri Aug 1 07:00:54 PDT 2025

https://github.com/statham-arm updated https://github.com/llvm/llvm-project/pull/151354

>From 7cc5fed6d24f78114607309a5d1d0aba3a1553e4 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Wed, 30 Jul 2025 13:46:30 +0100
Subject: [PATCH 1/4] [clang][ARM] Fix build failure in <arm_acle.h> for __swp

In commit d5985905ae8e5b2 I introduced a Sema check that prohibits
`__builtin_arm_ldrex` and `__builtin_arm_strex` for data sizes not
supported by the target architecture version. However, `arm_acle.h`
unconditionally uses those builtins with a 32-bit data size. As a
result, including that header now causes a build failure on Armv6-M
and on historic architectures such as Armv5.

To fix it, `arm_acle.h` now queries the compiler-defined
`__ARM_FEATURE_LDREX` macro (also fixed recently in commit
34f59d79209268e so that it matches the target architecture). If 32-bit
LDREX isn't available, `__swp` falls back to the older SWP instruction
or, failing that (on Armv6-M), to a libcall.

While I was modifying the header anyway, I also renamed the local
variable `v` inside `__swp` so that it starts with `__`, avoiding any
risk of user code having #defined `v`.
---
 clang/lib/Headers/arm_acle.h      | 22 +++++++++++++++++++---
 clang/test/CodeGen/arm_acle_swp.c | 19 +++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/arm_acle_swp.c

diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 5cfa3d023a7d5..a74144baadaba 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -55,11 +55,27 @@ __chkfeat(uint64_t __features) {
 /* 7.5 Swap */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __swp(uint32_t __x, volatile uint32_t *__p) {
-  uint32_t v;
+  uint32_t __v;
+#if __ARM_FEATURE_LDREX & 4
   do
-    v = __builtin_arm_ldrex(__p);
+    __v = __builtin_arm_ldrex(__p);
   while (__builtin_arm_strex(__x, __p));
-  return v;
+#elif !__ARM_ARCH_6M__
+  /* Fall back to the deprecated SWP instruction, on historic architecture
+   * versions without load/store exclusive instructions on 32-bit data. ACLE is
+   * clear that we mustn't use SWP in any _other_ situation, but permits us to
+   * use it if there's no other option. */
+  __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory");
+#else
+  /* Armv6-M doesn't have either of LDREX or SWP. ACLE suggests this
+   * implementation, which Clang lowers to the 'cmpxchg' operation in LLVM IR.
+   * On Armv6-M, LLVM turns that into a libcall to __atomic_compare_exchange_4,
+   * so the runtime will need to implement that. */
+  do
+    __v = *__p;
+  while (__sync_bool_compare_and_swap(__p, __v, __x));
+#endif
+  return __v;
 }
 
 /* 7.6 Memory prefetch intrinsics */
diff --git a/clang/test/CodeGen/arm_acle_swp.c b/clang/test/CodeGen/arm_acle_swp.c
new file mode 100644
index 0000000000000..3a99737163afe
--- /dev/null
+++ b/clang/test/CodeGen/arm_acle_swp.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -ffreestanding -triple thumbv7m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
+// RUN: %clang_cc1 -ffreestanding -triple armv7a-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
+// RUN: %clang_cc1 -ffreestanding -triple armv6-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
+// RUN: %clang_cc1 -ffreestanding -triple thumbv6m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=SYNC
+// RUN: %clang_cc1 -ffreestanding -triple armv5-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=SWP
+
+// REQUIRES: arm-registered-target
+
+#include <arm_acle.h>
+
+// LDREX: call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) {{.*}})
+// LDREX: call i32 @llvm.arm.strex.p0(i32 {{.*}}, ptr elementtype(i32) {{.*}})
+
+// SWP:   call i32 asm "swp $0, $1, [$2]", "=r,r,r,~{memory}"(i32 {{.*}}, ptr {{.*}})
+
+// SYNC:  cmpxchg ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+uint32_t test_swp(uint32_t x, volatile void *p) {
+  return __swp(x, p);
+}

>From f95f6076c91bbdee5220d7d41f2b549d206bde47 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 31 Jul 2025 09:58:01 +0100
Subject: [PATCH 2/4] Use __atomic_exchange_n to improve the v6-M case

---
 clang/lib/Headers/arm_acle.h      | 11 ++++-------
 clang/test/CodeGen/arm_acle_swp.c | 10 +++++-----
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index a74144baadaba..d769e8310847a 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -67,13 +67,10 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
    * use it if there's no other option. */
   __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory");
 #else
-  /* Armv6-M doesn't have either of LDREX or SWP. ACLE suggests this
-   * implementation, which Clang lowers to the 'cmpxchg' operation in LLVM IR.
-   * On Armv6-M, LLVM turns that into a libcall to __atomic_compare_exchange_4,
-   * so the runtime will need to implement that. */
-  do
-    __v = *__p;
-  while (__sync_bool_compare_and_swap(__p, __v, __x));
+  /* Armv6-M doesn't have either of LDREX or SWP. LLVM turns the following
+   * builtin into a libcall to __atomic_exchange_4, so the runtime will need to
+   * implement that. */
+  __v = __atomic_exchange_n(__p, __x, __ATOMIC_RELAXED);
 #endif
   return __v;
 }
diff --git a/clang/test/CodeGen/arm_acle_swp.c b/clang/test/CodeGen/arm_acle_swp.c
index 3a99737163afe..fe87b8c1f7230 100644
--- a/clang/test/CodeGen/arm_acle_swp.c
+++ b/clang/test/CodeGen/arm_acle_swp.c
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -ffreestanding -triple thumbv7m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
 // RUN: %clang_cc1 -ffreestanding -triple armv7a-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
 // RUN: %clang_cc1 -ffreestanding -triple armv6-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
-// RUN: %clang_cc1 -ffreestanding -triple thumbv6m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=SYNC
+// RUN: %clang_cc1 -ffreestanding -triple thumbv6m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
 // RUN: %clang_cc1 -ffreestanding -triple armv5-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=SWP
 
 // REQUIRES: arm-registered-target
 
 #include <arm_acle.h>
 
-// LDREX: call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) {{.*}})
-// LDREX: call i32 @llvm.arm.strex.p0(i32 {{.*}}, ptr elementtype(i32) {{.*}})
+// LDREX:  call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) {{.*}})
+// LDREX:  call i32 @llvm.arm.strex.p0(i32 {{.*}}, ptr elementtype(i32) {{.*}})
 
-// SWP:   call i32 asm "swp $0, $1, [$2]", "=r,r,r,~{memory}"(i32 {{.*}}, ptr {{.*}})
+// SWP:    call i32 asm "swp $0, $1, [$2]", "=r,r,r,~{memory}"(i32 {{.*}}, ptr {{.*}})
 
-// SYNC:  cmpxchg ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+// ATOMIC: atomicrmw volatile xchg ptr {{.*}}, i32 {{.*}} monotonic, align 4
 uint32_t test_swp(uint32_t x, volatile void *p) {
   return __swp(x, p);
 }

>From bfa6a3d73b7c542017d40e3ea458346a8fd2af78 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Fri, 1 Aug 2025 11:55:26 +0100
Subject: [PATCH 3/4] Use __atomic_exchange_n for modern systems and Linux too

---
 clang/lib/Headers/arm_acle.h | 41 ++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index d769e8310847a..fcc2075121b44 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -56,21 +56,34 @@ __chkfeat(uint64_t __features) {
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __swp(uint32_t __x, volatile uint32_t *__p) {
   uint32_t __v;
-#if __ARM_FEATURE_LDREX & 4
-  do
-    __v = __builtin_arm_ldrex(__p);
-  while (__builtin_arm_strex(__x, __p));
-#elif !__ARM_ARCH_6M__
-  /* Fall back to the deprecated SWP instruction, on historic architecture
-   * versions without load/store exclusive instructions on 32-bit data. ACLE is
-   * clear that we mustn't use SWP in any _other_ situation, but permits us to
-   * use it if there's no other option. */
-  __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory");
-#else
-  /* Armv6-M doesn't have either of LDREX or SWP. LLVM turns the following
-   * builtin into a libcall to __atomic_exchange_4, so the runtime will need to
-   * implement that. */
+#if (__ARM_FEATURE_LDREX & 4) || __ARM_ARCH_6M__ || __linux__
+  /*
+   * Using this clang builtin is sensible in most situations. Where
+   * LDREX and STREX are available, it will compile to a loop using
+   * them. Otherwise it will compile to a libcall, requiring the
+   * runtime to provide that library function.
+   *
+   * That's unavoidable on Armv6-M, which has no atomic instructions
+   * at all (not even SWP), so in that situation the user will just
+   * have to provide an implementation of __atomic_exchange_4 (perhaps
+   * it would temporarily disable interrupts, and then do a separate
+   * load and store).
+   *
+   * We also use the libcall strategy on pre-Armv7 Linux targets, on
+   * the theory that Linux's runtime support library _will_ provide a
+   * suitable libcall, and it's better to use that than the SWP
+   * instruction because then when the same binary is run on a later
+   * Linux system the libcall implementation will use LDREX instead.
+   */
   __v = __atomic_exchange_n(__p, __x, __ATOMIC_RELAXED);
+#else
+  /*
+   * But for older Arm architectures when the target is not Linux, we
+   * fall back to using the SWP instruction via inline assembler. ACLE
+   * is clear that we're allowed to do this, but shouldn't do it if we
+   * have a better alternative.
+   */
+  __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory");
 #endif
   return __v;
 }

>From 4175aa855ba1179f64203d41f33932b58b337d76 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Fri, 1 Aug 2025 14:52:31 +0100
Subject: [PATCH 4/4] Fix tests (oops)

---
 clang/test/CodeGen/arm_acle.c     | 27 ++++-----------------------
 clang/test/CodeGen/arm_acle_swp.c | 10 ++++------
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 74de8246d7de6..0f539cba5c758 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -139,29 +139,10 @@ void test_dbg(void) {
 #endif
 
 /* 8.5 Swap */
-// AArch32-LABEL: @test_swp(
-// AArch32-NEXT:  entry:
-// AArch32-NEXT:    br label [[DO_BODY_I:%.*]]
-// AArch32:       do.body.i:
-// AArch32-NEXT:    [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) [[P:%.*]])
-// AArch32-NEXT:    [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0(i32 [[X:%.*]], ptr elementtype(i32) [[P]])
-// AArch32-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i32 [[STREX_I]], 0
-// AArch32-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
-// AArch32:       __swp.exit:
-// AArch32-NEXT:    ret void
-//
-// AArch64-LABEL: @test_swp(
-// AArch64-NEXT:  entry:
-// AArch64-NEXT:    br label [[DO_BODY_I:%.*]]
-// AArch64:       do.body.i:
-// AArch64-NEXT:    [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) [[P:%.*]])
-// AArch64-NEXT:    [[TMP0:%.*]] = trunc i64 [[LDXR_I]] to i32
-// AArch64-NEXT:    [[TMP1:%.*]] = zext i32 [[X:%.*]] to i64
-// AArch64-NEXT:    [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i32) [[P]])
-// AArch64-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i32 [[STXR_I]], 0
-// AArch64-NEXT:    br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
-// AArch64:       __swp.exit:
-// AArch64-NEXT:    ret void
+// ARM-LABEL: @test_swp(
+// ARM-NEXT:  entry:
+// ARM-NEXT:    [[TMP0:%.*]] = atomicrmw volatile xchg ptr [[P:%.*]], i32 [[X:%.*]] monotonic, align 4
+// ARM-NEXT:    ret void
 //
 void test_swp(uint32_t x, volatile void *p) {
   __swp(x, p);
diff --git a/clang/test/CodeGen/arm_acle_swp.c b/clang/test/CodeGen/arm_acle_swp.c
index fe87b8c1f7230..15fb49d06e631 100644
--- a/clang/test/CodeGen/arm_acle_swp.c
+++ b/clang/test/CodeGen/arm_acle_swp.c
@@ -1,16 +1,14 @@
-// RUN: %clang_cc1 -ffreestanding -triple thumbv7m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
-// RUN: %clang_cc1 -ffreestanding -triple armv7a-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
-// RUN: %clang_cc1 -ffreestanding -triple armv6-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=LDREX
+// RUN: %clang_cc1 -ffreestanding -triple thumbv7m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
+// RUN: %clang_cc1 -ffreestanding -triple armv7a-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
+// RUN: %clang_cc1 -ffreestanding -triple armv6-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
 // RUN: %clang_cc1 -ffreestanding -triple thumbv6m-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
+// RUN: %clang_cc1 -ffreestanding -triple armv5-unknown-linux-gnu -target-abi aapcs -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=ATOMIC
 // RUN: %clang_cc1 -ffreestanding -triple armv5-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefix=SWP
 
 // REQUIRES: arm-registered-target
 
 #include <arm_acle.h>
 
-// LDREX:  call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) {{.*}})
-// LDREX:  call i32 @llvm.arm.strex.p0(i32 {{.*}}, ptr elementtype(i32) {{.*}})
-
 // SWP:    call i32 asm "swp $0, $1, [$2]", "=r,r,r,~{memory}"(i32 {{.*}}, ptr {{.*}})
 
 // ATOMIC: atomicrmw volatile xchg ptr {{.*}}, i32 {{.*}} monotonic, align 4