[clang] [llvm] [BPF] Do atomic_fetch_*() pattern matching with memory ordering (PR #107343)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 24 12:32:12 PDT 2024


https://github.com/yonghong-song updated https://github.com/llvm/llvm-project/pull/107343

>From 844342c25c67b66c89c0a82d49035492fab24461 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Tue, 3 Sep 2024 21:26:17 -0700
Subject: [PATCH 1/3] [BPF] Do atomic_fetch_*() pattern matching with memory
 ordering

For atomic_fetch_*() operations, do pattern matching with memory orderings
seq_cst, acq_rel, release, acquire and monotonic (relaxed). For operations
with seq_cst/acq_rel/release/acquire ordering, atomic_fetch_*()
instructions are generated. For monotonic ordering, locked insns are generated
if the return value is not used; otherwise, atomic_fetch_*() insns are used.
The main motivation is to resolve the kernel issue [1].

The following memory orderings are supported:
  seq_cst, acq_rel, release, acquire, relaxed
Current gcc-style __sync_fetch_and_*() operations are all seq_cst.
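For illustration, a minimal sketch (hypothetical file sync.c): the gcc-style
builtin takes no ordering argument and is always seq_cst, equivalent to the
explicit c11 call:

```
#include <stdatomic.h>
/* gcc-style builtin: no ordering argument, implicitly seq_cst */
void f1(int *i) {
   (void)__sync_fetch_and_add(i, 10);
}
/* c11 builtin with the equivalent explicit ordering */
void f2(_Atomic int *i) {
   (void)__c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
}
```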

To use explicit memory ordering, the _Atomic type is needed.  The following is
an example:

```
$ cat test.c
#include <stdatomic.h>
void f1(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
void f2(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
void f3(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test.c -o test.o && llvm-objdump -d test.o
$ ./run.sh

test.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       1:       c3 21 00 00 50 00 00 00 lock *(u32 *)(r1 + 0x0) &= w2
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       4:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       7:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       8:       95 00 00 00 00 00 00 00 exit
```

The following is another example where the return value is used:

```
$ cat test1.c
#include <stdatomic.h>
int f1(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
int f2(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
int f3(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test1.c -o test1.o && llvm-objdump -d test1.o
$ ./run.sh

test1.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       1:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       4:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       7:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       8:       95 00 00 00 00 00 00 00 exit
```

You can see that for relaxed memory ordering, if the return value is used, the
atomic_fetch_and() insn is used; otherwise, the locked insn is used.
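To see both lowerings side by side, a minimal sketch (hypothetical file
test2.c) combining the two relaxed cases:

```
#include <stdatomic.h>
int g(_Atomic int *i) {
   /* return value unused: lowers to a locked insn */
   (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
   /* return value used: lowers to atomic_fetch_and() */
   return __c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
```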

Here is another example with a global _Atomic variable:

```
$ cat test3.c
#include <stdatomic.h>

_Atomic int i;

void f1(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_relaxed);
}
void f2(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test3.c -o test3.o && llvm-objdump -d test3.o
$ ./run.sh

test3.o:        file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       1:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       3:       c3 12 00 00 50 00 00 00 lock *(u32 *)(r2 + 0x0) &= w1
       4:       95 00 00 00 00 00 00 00 exit

0000000000000028 <f2>:
       5:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       6:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       8:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
       9:       95 00 00 00 00 00 00 00 exit
```

Note that '-g' is not used in the above compilations. The reason is the
following IR related to the _Atomic type:
```
$ clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -g -S -emit-llvm test3.c
```
The related debug info for test3.c:
```
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "i", scope: !2, file: !3, line: 3, type: !16, isLocal: false, isDefinition: true)
...
!16 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !17)
!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
```

When compiling test.c, the related debug info is:
```
...
!19 = distinct !DISubprogram(name: "f1", scope: !1, file: !1, line: 3, type: !20, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !25)
!20 = !DISubroutineType(types: !21)
!21 = !{null, !22}
!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
!23 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !24)
!24 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!25 = !{!26}
!26 = !DILocalVariable(name: "i", arg: 1, scope: !19, file: !1, line: 3, type: !22)
```

All the above suggests that _Atomic behaves like a modifier (e.g. const,
restrict, volatile). This seems true based on the DWARF doc [2].
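For example, _Atomic stacks with other qualifiers the same way const/volatile
do (hypothetical declarations; dwarf chains follow the convention above):

```
volatile _Atomic int *q;  /* dwarf chain: ptr -> volatile -> atomic -> int */
const _Atomic int c = 0;  /* dwarf chain: const -> atomic -> int */
```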

Without properly handling DW_TAG_atomic_type, llvm BTF generation will be
incorrect, since the current implementation assumes DW_TAG_atomic_type does
not exist. So we have two choices here:
  (1). The llvm bpf backend processes DW_TAG_atomic_type but ignores it in BTF encoding.
  (2). Add another type, e.g., BTF_KIND_ATOMIC, to BTF. BTF_KIND_ATOMIC behaves as a
       modifier like const/volatile/restrict.

For choice (1), the llvm bpf backend should skip dwarf::DW_TAG_atomic_type
during BTF generation whenever necessary.

For choice (2), BTF_KIND_ATOMIC would be added to BTF, so both the llvm
backend and the kernel need to handle it properly. Its main advantage is
probably that the atomic type is maintained and thus also available to the
skeleton. But I think for the skeleton a raw type is probably good enough,
unless user space intends to do some atomic operation on it, which is an
unlikely case.

So I chose choice (1) in this implementation.
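
Concretely, with choice (1) the atomic modifier simply disappears from the
BTF encoding. A sketch based on the atomics.ll test added in patch 3:

```
/* For this global, BTF encodes (per the atomics.ll CHECK lines):
 *   [1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
 *   [8] VAR 'gbl' type_id=1, linkage=global
 * i.e. the VAR points directly at the base int type. */
_Atomic int gbl;
```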

 [1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/
 [2] https://dwarfstd.org/issues/131112.1.html
---
 clang/lib/Basic/Targets/BPF.cpp       |   1 +
 llvm/lib/Target/BPF/BPFInstrInfo.td   | 134 ++++++++++++++++++++++----
 llvm/lib/Target/BPF/BPFMIChecking.cpp |   6 +-
 3 files changed, 120 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index 931f407ecb0d7e..f4684765b7ffb3 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -38,6 +38,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST");
   Builder.defineMacro("__BPF_FEATURE_MAY_GOTO");
+  Builder.defineMacro("__BPF_FEATURE_ATOMIC_MEM_ORDERING");
 
   if (CPU.empty())
     CPU = "v3";
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index f7e17901c7ed5e..62d6e25f83b59f 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -826,13 +826,12 @@ let Predicates = [BPFNoALU32] in {
 }
 
 // Atomic Fetch-and-<add, and, or, xor> operations
-class XFALU64<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
-              string OpcStr, PatFrag OpNode>
+class XFALU64<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr, string OpcStr>
     : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
                  (outs GPR:$dst),
                  (ins MEMri:$addr, GPR:$val),
                  "$dst = atomic_fetch_"#OpcStr#"(("#OpcodeStr#" *)($addr), $val)",
-                 [(set GPR:$dst, (OpNode ADDRri:$addr, GPR:$val))]> {
+                 []> {
   bits<4> dst;
   bits<20> addr;
 
@@ -844,13 +843,12 @@ class XFALU64<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
   let BPFClass = BPF_STX;
 }
 
-class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
-              string OpcStr, PatFrag OpNode>
+class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr, string OpcStr>
     : TYPE_LD_ST<BPF_ATOMIC.Value, SizeOp.Value,
                  (outs GPR32:$dst),
                  (ins MEMri:$addr, GPR32:$val),
                  "$dst = atomic_fetch_"#OpcStr#"(("#OpcodeStr#" *)($addr), $val)",
-                 [(set GPR32:$dst, (OpNode ADDRri:$addr, GPR32:$val))]> {
+                 []> {
   bits<4> dst;
   bits<20> addr;
 
@@ -864,26 +862,122 @@ class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
 
 let Constraints = "$dst = $val" in {
   let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
-    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32>;
-    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32>;
-    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or",  atomic_load_or_i32>;
-    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32>;
+    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add">;
+    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and">;
+    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or">;
+    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor">;
   }
 
   let Predicates = [BPFHasALU32] in {
-    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64>;
+    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add">;
   }
-  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64>;
-  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or",  atomic_load_or_i64>;
-  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64>;
+  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and">;
+  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or">;
+  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor">;
 }
 
-// atomic_load_sub can be represented as a neg followed
-// by an atomic_load_add.
-def : Pat<(atomic_load_sub_i32 ADDRri:$addr, GPR32:$val),
-          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
-def : Pat<(atomic_load_sub_i64 ADDRri:$addr, GPR:$val),
-          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+let Predicates = [BPFHasALU32] in {
+    foreach P = [// add
+                 [atomic_load_add_i32_monotonic,  XADDW32],
+                 [atomic_load_add_i32_acquire,   XFADDW32],
+                 [atomic_load_add_i32_release,   XFADDW32],
+                 [atomic_load_add_i32_acq_rel,   XFADDW32],
+                 [atomic_load_add_i32_seq_cst,   XFADDW32],
+                 // and
+                 [atomic_load_and_i32_monotonic,  XANDW32],
+                 [atomic_load_and_i32_acquire,   XFANDW32],
+                 [atomic_load_and_i32_release,   XFANDW32],
+                 [atomic_load_and_i32_acq_rel,   XFANDW32],
+                 [atomic_load_and_i32_seq_cst,   XFANDW32],
+                 // or
+                 [atomic_load_or_i32_monotonic,   XORW32],
+                 [atomic_load_or_i32_acquire,    XFORW32],
+                 [atomic_load_or_i32_release,    XFORW32],
+                 [atomic_load_or_i32_acq_rel,    XFORW32],
+                 [atomic_load_or_i32_seq_cst,    XFORW32],
+                 // xor
+                 [atomic_load_xor_i32_monotonic,  XXORW32],
+                 [atomic_load_xor_i32_acquire,   XFXORW32],
+                 [atomic_load_xor_i32_release,   XFXORW32],
+                 [atomic_load_xor_i32_acq_rel,   XFXORW32],
+                 [atomic_load_xor_i32_seq_cst,   XFXORW32],
+                ] in {
+      def : Pat<(P[0] ADDRri:$addr, GPR32:$val), (P[1]  ADDRri:$addr, GPR32:$val)>;
+    }
+
+    // atomic_load_sub can be represented as a neg followed
+    // by an atomic_load_add.
+    foreach P = [[atomic_load_sub_i32_monotonic,  XADDW32],
+                 [atomic_load_sub_i32_acquire,   XFADDW32],
+                 [atomic_load_sub_i32_release,   XFADDW32],
+                 [atomic_load_sub_i32_acq_rel,   XFADDW32],
+                 [atomic_load_sub_i32_seq_cst,   XFADDW32],
+                ] in {
+      def : Pat<(P[0] ADDRri:$addr, GPR32:$val), (P[1]  ADDRri:$addr, (NEG_32 GPR32:$val))>;
+    }
+
+    foreach P = [// add
+                 [atomic_load_add_i64_monotonic,  XADDD],
+                 [atomic_load_add_i64_acquire,   XFADDD],
+                 [atomic_load_add_i64_release,   XFADDD],
+                 [atomic_load_add_i64_acq_rel,   XFADDD],
+                 [atomic_load_add_i64_seq_cst,   XFADDD],
+                ] in {
+      def : Pat<(P[0] ADDRri:$addr, GPR:$val), (P[1]  ADDRri:$addr, GPR:$val)>;
+    }
+}
+
+foreach P = [[atomic_load_sub_i64_monotonic,  XADDD],
+             [atomic_load_sub_i64_acquire,   XFADDD],
+             [atomic_load_sub_i64_release,   XFADDD],
+             [atomic_load_sub_i64_acq_rel,   XFADDD],
+             [atomic_load_sub_i64_seq_cst,   XFADDD],
+            ] in {
+  def : Pat<(P[0] ADDRri:$addr, GPR:$val), (P[1]  ADDRri:$addr, (NEG_64 GPR:$val))>;
+}
+
+// Borrow the idea from X86InstrFragments.td
+class binop_no_use<SDPatternOperator operator>
+      : PatFrag<(ops node:$A, node:$B),
+                (operator node:$A, node:$B),
+                [{ return SDValue(N, 0).use_empty(); }]>;
+
+class binop_has_use<SDPatternOperator operator>
+      : PatFrag<(ops node:$A, node:$B),
+                (operator node:$A, node:$B),
+                [{ return !SDValue(N, 0).use_empty(); }]>;
+
+foreach op = [add, and, or, xor] in {
+def atomic_load_ # op # _i64_monotonic_nu:
+    binop_no_use <!cast<SDPatternOperator>("atomic_load_"#op# _i64_monotonic)>;
+def atomic_load_ # op # _i64_monotonic_hu:
+    binop_has_use<!cast<SDPatternOperator>("atomic_load_"#op# _i64_monotonic)>;
+}
+
+foreach P = [// and
+             [atomic_load_and_i64_monotonic_nu, XANDD],
+             [atomic_load_and_i64_monotonic_hu, XFANDD],
+             [atomic_load_and_i64_acquire,   XFANDD],
+             [atomic_load_and_i64_release,   XFANDD],
+             [atomic_load_and_i64_acq_rel,   XFANDD],
+             [atomic_load_and_i64_seq_cst,   XFANDD],
+             // or
+             [atomic_load_or_i64_monotonic_nu, XORD],
+             [atomic_load_or_i64_monotonic_hu, XFORD],
+             [atomic_load_or_i64_acquire,    XFORD],
+             [atomic_load_or_i64_release,    XFORD],
+             [atomic_load_or_i64_acq_rel,    XFORD],
+             [atomic_load_or_i64_seq_cst,    XFORD],
+             // xor
+             [atomic_load_xor_i64_monotonic_nu, XXORD],
+             [atomic_load_xor_i64_monotonic_hu, XFXORD],
+             [atomic_load_xor_i64_acquire,   XFXORD],
+             [atomic_load_xor_i64_release,   XFXORD],
+             [atomic_load_xor_i64_acq_rel,   XFXORD],
+             [atomic_load_xor_i64_seq_cst,   XFXORD],
+            ] in {
+  def : Pat<(P[0] ADDRri:$addr, GPR:$val), (P[1]  ADDRri:$addr, GPR:$val)>;
+}
 
 // Atomic Exchange
 class XCHG<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
index 24224f6c1e9e66..09635dbba17607 100644
--- a/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -118,7 +118,7 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
 
     RegIsGPR64 = GPR64RegClass->contains(MO.getReg());
     if (!MO.isDead()) {
-      // It is a GPR64 live Def, we are sure it is live. */
+      // It is a GPR64 live Def, we are sure it is live.
       if (RegIsGPR64)
         return true;
       // It is a GPR32 live Def, we are unsure whether it is really dead due to
@@ -153,6 +153,10 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
 }
 
 void BPFMIPreEmitChecking::processAtomicInsts() {
+  if (MF->getSubtarget<BPFSubtarget>().getHasJmp32())
+    return;
+
+  // Only check for cpu version 1 and 2.
   for (MachineBasicBlock &MBB : *MF) {
     for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD)

>From f9b8eafa3ea775e0fa3defc3e4add4fc3918b529 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Mon, 9 Sep 2024 11:04:17 -0700
Subject: [PATCH 2/3] [BPF] Handle DW_TAG_atomic_type properly

Change BTFDebug.cpp to handle DW_TAG_atomic_type properly.
Otherwise, for a type like
   _Atomic int i; // global
the dwarf type chain is atomic->int.
Since DW_TAG_atomic_type is not processed, BTF generation will stop
at the atomic modifier and BTF will encode 'i' as a void type.

Similarly, for a type like
  volatile _Atomic int *p;
the dwarf type chain is ptr->volatile->atomic->int.
Since the atomic type is not processed, BTF generation will stop at
the atomic type, and the eventual BTF type will be
  ptr->volatile->void
which is incorrect.

This patch fixes the following cases, including the above two patterns,
by skipping DW_TAG_atomic_type (see the sketch after this list):
  - global variable with an _Atomic type.
  - function parameter and return type with an _Atomic type.
  - struct member with an _Atomic type.
  - ptr/const/volatile/restrict pointing to an _Atomic type.
  - btf_type_tag where a ptr points to an _Atomic type with btf_type_tag.
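
A compact sketch of declarations exercising these patterns (hypothetical
names, same shapes as the atomics.ll test below):

```
_Atomic int gbl;                                        /* global variable */
struct s_t { _Atomic int a; };                          /* struct member */
const volatile _Atomic int *p;                          /* qualifiers over _Atomic */
_Atomic int __attribute__((btf_type_tag("tag1"))) *tp;  /* btf_type_tag + _Atomic */
_Atomic int f(_Atomic int a1, _Atomic int *p1);         /* param and return types */
```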

With the changed llvm, the new bpf code in kernel selftest arena_atomics.c
([1]) looks like

```
_Atomic __u64 __arena_global and64_value = (0x110ull << 32);
_Atomic __u32 __arena_global and32_value = 0x110;

SEC("raw_tp/sys_enter")
int and(const void *ctx)
{
	...
        __c11_atomic_fetch_and(&and64_value, 0x011ull << 32, memory_order_relaxed);
        __c11_atomic_fetch_and(&and32_value, 0x011, memory_order_relaxed);
	...

        return 0;
}
```
and compilation is successful.

The skel file arena_atomics.skel.h will be
```
struct arena_atomics__arena {
	...
	__u64 and64_value;
	__u32 and32_value;
	...
} *arena;
```

  [1] https://lore.kernel.org/r/20240909223431.1666305-1-yonghong.song@linux.dev
---
 clang/lib/CodeGen/CGDebugInfo.cpp |  6 +++++-
 llvm/lib/Target/BPF/BTFDebug.cpp  | 29 ++++++++++++++++++++++-------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 2d2c280941bd64..4782e80f221772 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1249,8 +1249,12 @@ llvm::DIType *CGDebugInfo::CreatePointerLikeType(llvm::dwarf::Tag Tag,
       CGM.getTarget().getDWARFAddressSpace(
           CGM.getTypes().getTargetAddressSpace(PointeeTy));
 
+  const BTFTagAttributedType *BTFAttrTy;
+  if (auto *Atomic = PointeeTy->getAs<AtomicType>())
+    BTFAttrTy = dyn_cast<BTFTagAttributedType>(Atomic->getValueType());
+  else
+    BTFAttrTy = dyn_cast<BTFTagAttributedType>(PointeeTy);
   SmallVector<llvm::Metadata *, 4> Annots;
-  auto *BTFAttrTy = dyn_cast<BTFTagAttributedType>(PointeeTy);
   while (BTFAttrTy) {
     StringRef Tag = BTFAttrTy->getAttr()->getBTFTypeTag();
     if (!Tag.empty()) {
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 4d847abea731dc..9d6dee13ca97a9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -35,6 +35,15 @@ static const char *BTFKindStr[] = {
 #include "llvm/DebugInfo/BTF/BTF.def"
 };
 
+static const DIType *tryRemoveAtomicType(const DIType *Ty) {
+  if (!Ty)
+    return Ty;
+  auto DerivedTy = dyn_cast<DIDerivedType>(Ty);
+  if (DerivedTy && DerivedTy->getTag() == dwarf::DW_TAG_atomic_type)
+    return DerivedTy->getBaseType();
+  return Ty;
+}
+
 /// Emit a BTF common type.
 void BTFTypeBase::emitType(MCStreamer &OS) {
   OS.AddComment(std::string(BTFKindStr[Kind]) + "(id = " + std::to_string(Id) +
@@ -90,7 +99,7 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
     return;
 
   // The base type for PTR/CONST/VOLATILE could be void.
-  const DIType *ResolvedType = DTy->getBaseType();
+  const DIType *ResolvedType = tryRemoveAtomicType(DTy->getBaseType());
   if (!ResolvedType) {
     assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
             Kind == BTF::BTF_KIND_VOLATILE) &&
@@ -305,7 +314,7 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
     } else {
       BTFMember.Offset = DDTy->getOffsetInBits();
     }
-    const auto *BaseTy = DDTy->getBaseType();
+    const auto *BaseTy = tryRemoveAtomicType(DDTy->getBaseType());
     BTFMember.Type = BDebug.getTypeId(BaseTy);
     Members.push_back(BTFMember);
   }
@@ -342,7 +351,7 @@ void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
   IsCompleted = true;
 
   DITypeRefArray Elements = STy->getTypeArray();
-  auto RetType = Elements[0];
+  auto RetType = tryRemoveAtomicType(Elements[0]);
   BTFType.Type = RetType ? BDebug.getTypeId(RetType) : 0;
   BTFType.NameOff = 0;
 
@@ -350,7 +359,7 @@ void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
   // to represent the vararg, encode the NameOff/Type to be 0.
   for (unsigned I = 1, N = Elements.size(); I < N; ++I) {
     struct BTF::BTFParam Param;
-    auto Element = Elements[I];
+    auto Element = tryRemoveAtomicType(Elements[I]);
     if (Element) {
       Param.NameOff = BDebug.addString(FuncArgNames[I]);
       Param.Type = BDebug.getTypeId(Element);
@@ -483,7 +492,7 @@ void BTFTypeTypeTag::completeType(BTFDebug &BDebug) {
   IsCompleted = true;
   BTFType.NameOff = BDebug.addString(Tag);
   if (DTy) {
-    const DIType *ResolvedType = DTy->getBaseType();
+    const DIType *ResolvedType = tryRemoveAtomicType(DTy->getBaseType());
     if (!ResolvedType)
       BTFType.Type = 0;
     else
@@ -800,6 +809,10 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
                                 bool CheckPointer, bool SeenPointer) {
   unsigned Tag = DTy->getTag();
 
+  if (Tag == dwarf::DW_TAG_atomic_type)
+    return visitTypeEntry(DTy->getBaseType(), TypeId, CheckPointer,
+                          SeenPointer);
+
   /// Try to avoid chasing pointees, esp. structure pointees which may
   /// unnecessary bring in a lot of types.
   if (CheckPointer && !SeenPointer) {
@@ -1444,8 +1457,10 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
       DIGlobal = GVE->getVariable();
       if (SecName.starts_with(".maps"))
         visitMapDefType(DIGlobal->getType(), GVTypeId);
-      else
-        visitTypeEntry(DIGlobal->getType(), GVTypeId, false, false);
+      else {
+        const DIType *Ty = tryRemoveAtomicType(DIGlobal->getType());
+        visitTypeEntry(Ty, GVTypeId, false, false);
+      }
       break;
     }
 

>From 4fda286a258ddd7022e1059b63d6e1589fe453ae Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Fri, 13 Sep 2024 09:51:35 -0700
Subject: [PATCH 3/3] [BPF] Add functionality/btf selftests for memory ordering
 cases

The following test cases are added (see the sketch after this list):
  - all memory orderings and their asm codes with -mcpu=v3
  - all memory orderings and their asm codes with -mcpu=v1
    Note that __c11_atomic_fetch_{sub,and,or,xor} for 32bit won't
    work for -mcpu=v1. Also, at -mcpu=v1, no return value is
    allowed for 64bit __sync_fetch_and_add().
  - at -mcpu=v1, for 64bit __c11_atomic_fetch_sub() with relaxed
    memory ordering, the xaddd insn is used, so the return
    value is not supported; it works fine as long as the return
    value is not used. This matches the behavior of 64bit
    __c11_atomic_fetch_add() with relaxed memory ordering
    at -mcpu=v1.
  - BTF test with _Atomic types in different cases.
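
For illustration, a minimal sketch of the -mcpu=v1 limitations noted above
(hypothetical file v1_limits.c; only ok64() compiles at -mcpu=v1):

```
#include <stdatomic.h>
/* ok at -mcpu=v1: 64bit relaxed fetch_sub with an unused return value
 * lowers to the locked xaddd insn (the operand is negated first) */
void ok64(_Atomic long *p) {
   (void)__c11_atomic_fetch_sub(p, 10, memory_order_relaxed);
}
/* not ok at -mcpu=v1: 32bit atomic_fetch_*() insns are unavailable */
void bad32(_Atomic int *p) {
   (void)__c11_atomic_fetch_and(p, 10, memory_order_relaxed);
}
```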
---
 clang/test/CodeGen/bpf-attr-type-tag-atomic.c |  16 +
 llvm/test/CodeGen/BPF/BTF/atomics.ll          | 151 ++++
 llvm/test/CodeGen/BPF/BTF/print_btf.py        | 295 +++++++
 llvm/test/CodeGen/BPF/atomics_mem_order_v1.ll | 385 +++++++++
 llvm/test/CodeGen/BPF/atomics_mem_order_v3.ll | 781 ++++++++++++++++++
 .../CodeGen/BPF/atomics_sub64_relaxed_v1.ll   |  27 +
 llvm/test/CodeGen/BPF/xaddd_v1.ll             |  25 +
 7 files changed, 1680 insertions(+)
 create mode 100644 clang/test/CodeGen/bpf-attr-type-tag-atomic.c
 create mode 100644 llvm/test/CodeGen/BPF/BTF/atomics.ll
 create mode 100644 llvm/test/CodeGen/BPF/BTF/print_btf.py
 create mode 100644 llvm/test/CodeGen/BPF/atomics_mem_order_v1.ll
 create mode 100644 llvm/test/CodeGen/BPF/atomics_mem_order_v3.ll
 create mode 100644 llvm/test/CodeGen/BPF/atomics_sub64_relaxed_v1.ll
 create mode 100644 llvm/test/CodeGen/BPF/xaddd_v1.ll

diff --git a/clang/test/CodeGen/bpf-attr-type-tag-atomic.c b/clang/test/CodeGen/bpf-attr-type-tag-atomic.c
new file mode 100644
index 00000000000000..a10a45dc0808d1
--- /dev/null
+++ b/clang/test/CodeGen/bpf-attr-type-tag-atomic.c
@@ -0,0 +1,16 @@
+// REQUIRES: bpf-registered-target
+// RUN: %clang_cc1 -triple bpf -emit-llvm -disable-llvm-passes -debug-info-kind=limited %s -o - | FileCheck %s
+
+#define __tag1 __attribute__((btf_type_tag("tag1")))
+int _Atomic __tag1 *g1;
+volatile int _Atomic __tag1 *g2;
+
+// CHECK: distinct !DIGlobalVariable(name: "g1", scope: ![[#]], file: ![[#]], line: [[#]], type: ![[PTR1:[0-9]+]]
+// CHECK: distinct !DIGlobalVariable(name: "g2", scope: ![[#]], file: ![[#]], line: [[#]], type: ![[PTR2:[0-9]+]]
+// CHECK: ![[PTR2]]  = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[BASE2:[0-9]+]], size: [[#]], annotations: ![[ANNOT:[0-9]+]])
+// CHECK: ![[BASE2]] = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: ![[BASE1:[0-9]+]])
+// CHECK: ![[BASE1]] = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: ![[BASIC:[0-9]+]])
+// CHECK: ![[BASIC]] = !DIBasicType(name: "int", size: [[#]], encoding: DW_ATE_signed)
+// CHECK: ![[ANNOT]] = !{![[ENTRY:[0-9]+]]}
+// CHECK: ![[ENTRY]] = !{!"btf_type_tag", !"tag1"}
+// CHECK: ![[PTR1]]  = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[BASE1]], size: [[#]], annotations: ![[ANNOT]])
diff --git a/llvm/test/CodeGen/BPF/BTF/atomics.ll b/llvm/test/CodeGen/BPF/BTF/atomics.ll
new file mode 100644
index 00000000000000..2c02110f24c0d1
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/atomics.ll
@@ -0,0 +1,151 @@
+; RUN: llc -march=bpfel -mcpu=v3 -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK %s
+;
+; Source:
+;   #include <stdatomic.h>
+;   struct gstruct_t {
+;     _Atomic int a;
+;   } gstruct;
+;   extern _Atomic int ext;
+;   _Atomic int gbl;
+;   _Atomic int *pgbl;
+;   volatile _Atomic int vvar;
+;   _Atomic int __attribute__((btf_type_tag("foo"))) *tagptr1;
+;   volatile __attribute__((btf_type_tag("foo"))) _Atomic int *tagptr2;
+;   _Atomic int foo(_Atomic int a1, _Atomic int *p1) {
+;     (void)__c11_atomic_fetch_add(&gstruct.a, 1, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(&ext, 1, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(&gbl, 1, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(pgbl, 1, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(&vvar, 1, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(p1, 1, memory_order_relaxed);
+;
+;     return a1;
+;   }
+
+target triple = "bpf"
+
+%struct.gstruct_t = type { i32 }
+
+ at gstruct = dso_local global %struct.gstruct_t zeroinitializer, align 4, !dbg !0
+ at ext = external dso_local global i32, align 4, !dbg !34
+ at gbl = dso_local global i32 0, align 4, !dbg !16
+ at pgbl = dso_local local_unnamed_addr global ptr null, align 8, !dbg !20
+ at vvar = dso_local global i32 0, align 4, !dbg !23
+ at tagptr1 = dso_local local_unnamed_addr global ptr null, align 8, !dbg !26
+ at tagptr2 = dso_local local_unnamed_addr global ptr null, align 8, !dbg !31
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn
+define dso_local i32 @foo(i32 returned %a1, ptr nocapture noundef %p1) local_unnamed_addr #0 !dbg !45 {
+entry:
+    #dbg_value(i32 %a1, !49, !DIExpression(), !51)
+    #dbg_value(ptr %p1, !50, !DIExpression(), !51)
+  %0 = atomicrmw add ptr @gstruct, i32 1 monotonic, align 4, !dbg !52
+  %1 = atomicrmw add ptr @ext, i32 1 monotonic, align 4, !dbg !53
+  %2 = atomicrmw add ptr @gbl, i32 1 monotonic, align 4, !dbg !54
+  %3 = load ptr, ptr @pgbl, align 8, !dbg !55, !tbaa !56
+  %4 = atomicrmw add ptr %3, i32 1 monotonic, align 4, !dbg !60
+  %5 = atomicrmw volatile add ptr @vvar, i32 1 monotonic, align 4, !dbg !61
+  %6 = atomicrmw add ptr %p1, i32 1 monotonic, align 4, !dbg !62
+  ret i32 %a1, !dbg !63
+}
+
+; CHECK:             [1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-NEXT:        [2] PTR '(anon)' type_id=1
+; CHECK-NEXT:        [3] FUNC_PROTO '(anon)' ret_type_id=1 vlen=2
+; CHECK-NEXT:         'a1' type_id=1
+; CHECK-NEXT:         'p1' type_id=2
+; CHECK-NEXT:        [4] FUNC 'foo' type_id=3 linkage=global
+; CHECK-NEXT:        [5] STRUCT 'gstruct_t' size=4 vlen=1
+; CHECK-NEXT:         'a' type_id=1 bits_offset=0
+; CHECK-NEXT:        [6] VAR 'gstruct' type_id=5, linkage=global
+; CHECK-NEXT:        [7] VAR 'ext' type_id=1, linkage=extern
+; CHECK-NEXT:        [8] VAR 'gbl' type_id=1, linkage=global
+; CHECK-NEXT:        [9] VAR 'pgbl' type_id=2, linkage=global
+; CHECK-NEXT:        [10] VOLATILE '(anon)' type_id=1
+; CHECK-NEXT:        [11] VAR 'vvar' type_id=10, linkage=global
+; CHECK-NEXT:        [12] TYPE_TAG 'foo' type_id=1
+; CHECK-NEXT:        [13] PTR '(anon)' type_id=12
+; CHECK-NEXT:        [14] VAR 'tagptr1' type_id=13, linkage=global
+; CHECK-NEXT:        [15] TYPE_TAG 'foo' type_id=10
+; CHECK-NEXT:        [16] PTR '(anon)' type_id=15
+; CHECK-NEXT:        [17] VAR 'tagptr2' type_id=16, linkage=global
+; CHECK-NEXT:        [18] DATASEC '.bss' size=0 vlen=6
+; CHECK-NEXT:         type_id=6 offset=0 size=4
+; CHECK-NEXT:         type_id=8 offset=0 size=4
+; CHECK-NEXT:         type_id=9 offset=0 size=8
+; CHECK-NEXT:         type_id=11 offset=0 size=4
+; CHECK-NEXT:         type_id=14 offset=0 size=8
+; CHECK-NEXT:         type_id=17 offset=0 size=8
+
+attributes #0 = { mustprogress nofree norecurse nounwind willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!39, !40, !41, !42, !43}
+!llvm.ident = !{!44}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "gstruct", scope: !2, file: !3, line: 4, type: !36, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 96b5b6e527c024bea84f07ea11d4b3ff63468c22)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !15, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test6.c", directory: "/tmp/home/yhs/tmp3", checksumkind: CSK_MD5, checksum: "e743f2985da6027dcc5e048bd1dcccca")
+!4 = !{!5}
+!5 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "memory_order", file: !6, line: 68, baseType: !7, size: 32, elements: !8)
+!6 = !DIFile(filename: "work/yhs/llvm-project/llvm/build/install/lib/clang/20/include/stdatomic.h", directory: "/home/yhs", checksumkind: CSK_MD5, checksum: "f17199a988fe91afffaf0f943ef87096")
+!7 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!8 = !{!9, !10, !11, !12, !13, !14}
+!9 = !DIEnumerator(name: "memory_order_relaxed", value: 0)
+!10 = !DIEnumerator(name: "memory_order_consume", value: 1)
+!11 = !DIEnumerator(name: "memory_order_acquire", value: 2)
+!12 = !DIEnumerator(name: "memory_order_release", value: 3)
+!13 = !DIEnumerator(name: "memory_order_acq_rel", value: 4)
+!14 = !DIEnumerator(name: "memory_order_seq_cst", value: 5)
+!15 = !{!0, !16, !20, !23, !26, !31, !34}
+!16 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression())
+!17 = distinct !DIGlobalVariable(name: "gbl", scope: !2, file: !3, line: 6, type: !18, isLocal: false, isDefinition: true)
+!18 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !19)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !DIGlobalVariableExpression(var: !21, expr: !DIExpression())
+!21 = distinct !DIGlobalVariable(name: "pgbl", scope: !2, file: !3, line: 7, type: !22, isLocal: false, isDefinition: true)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64)
+!23 = !DIGlobalVariableExpression(var: !24, expr: !DIExpression())
+!24 = distinct !DIGlobalVariable(name: "vvar", scope: !2, file: !3, line: 8, type: !25, isLocal: false, isDefinition: true)
+!25 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !18)
+!26 = !DIGlobalVariableExpression(var: !27, expr: !DIExpression())
+!27 = distinct !DIGlobalVariable(name: "tagptr1", scope: !2, file: !3, line: 9, type: !28, isLocal: false, isDefinition: true)
+!28 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64, annotations: !29)
+!29 = !{!30}
+!30 = !{!"btf_type_tag", !"foo"}
+!31 = !DIGlobalVariableExpression(var: !32, expr: !DIExpression())
+!32 = distinct !DIGlobalVariable(name: "tagptr2", scope: !2, file: !3, line: 10, type: !33, isLocal: false, isDefinition: true)
+!33 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !25, size: 64, annotations: !29)
+!34 = !DIGlobalVariableExpression(var: !35, expr: !DIExpression())
+!35 = distinct !DIGlobalVariable(name: "ext", scope: !2, file: !3, line: 5, type: !18, isLocal: false, isDefinition: false)
+!36 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "gstruct_t", file: !3, line: 2, size: 32, elements: !37)
+!37 = !{!38}
+!38 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !36, file: !3, line: 3, baseType: !18, size: 32)
+!39 = !{i32 7, !"Dwarf Version", i32 5}
+!40 = !{i32 2, !"Debug Info Version", i32 3}
+!41 = !{i32 1, !"wchar_size", i32 4}
+!42 = !{i32 7, !"frame-pointer", i32 2}
+!43 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!44 = !{!"clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 96b5b6e527c024bea84f07ea11d4b3ff63468c22)"}
+!45 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 11, type: !46, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !48)
+!46 = !DISubroutineType(types: !47)
+!47 = !{!18, !18, !22}
+!48 = !{!49, !50}
+!49 = !DILocalVariable(name: "a1", arg: 1, scope: !45, file: !3, line: 11, type: !18)
+!50 = !DILocalVariable(name: "p1", arg: 2, scope: !45, file: !3, line: 11, type: !22)
+!51 = !DILocation(line: 0, scope: !45)
+!52 = !DILocation(line: 12, column: 9, scope: !45)
+!53 = !DILocation(line: 13, column: 9, scope: !45)
+!54 = !DILocation(line: 14, column: 9, scope: !45)
+!55 = !DILocation(line: 15, column: 32, scope: !45)
+!56 = !{!57, !57, i64 0}
+!57 = !{!"any pointer", !58, i64 0}
+!58 = !{!"omnipotent char", !59, i64 0}
+!59 = !{!"Simple C/C++ TBAA"}
+!60 = !DILocation(line: 15, column: 9, scope: !45)
+!61 = !DILocation(line: 16, column: 9, scope: !45)
+!62 = !DILocation(line: 17, column: 9, scope: !45)
+!63 = !DILocation(line: 19, column: 3, scope: !45)
diff --git a/llvm/test/CodeGen/BPF/BTF/print_btf.py b/llvm/test/CodeGen/BPF/BTF/print_btf.py
new file mode 100644
index 00000000000000..6ce08b76c363e1
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/print_btf.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+
+# Ad-hoc script to print BTF file in a readable format.
+# Follows the same printing conventions as bpftool with format 'raw'.
+# Usage:
+#
+#   ./print_btf.py <btf_file>
+#
+# Parameters:
+#
+#   <btf_file> :: a file name or '-' to read from stdin.
+#
+# Intended usage:
+#
+#   llvm-objcopy --dump-section .BTF=- <input> | ./print_btf.py -
+#
+# Kernel documentation contains detailed format description:
+#   https://www.kernel.org/doc/html/latest/bpf/btf.html
+
+import struct
+import ctypes
+import sys
+
+
+class SafeDict(dict):
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return f"<BAD_KEY: {key}>"
+
+
+KINDS = SafeDict(
+    {
+        0: "UNKN",
+        1: "INT",
+        2: "PTR",
+        3: "ARRAY",
+        4: "STRUCT",
+        5: "UNION",
+        6: "ENUM",
+        7: "FWD",
+        8: "TYPEDEF",
+        9: "VOLATILE",
+        10: "CONST",
+        11: "RESTRICT",
+        12: "FUNC",
+        13: "FUNC_PROTO",
+        14: "VAR",
+        15: "DATASEC",
+        16: "FLOAT",
+        17: "DECL_TAG",
+        18: "TYPE_TAG",
+        19: "ENUM64",
+    }
+)
+
+INT_ENCODING = SafeDict(
+    {0 << 0: "(none)", 1 << 0: "SIGNED", 1 << 1: "CHAR", 1 << 2: "BOOL"}
+)
+
+ENUM_ENCODING = SafeDict({0: "UNSIGNED", 1: "SIGNED"})
+
+FUNC_LINKAGE = SafeDict({0: "static", 1: "global", 2: "extern"})
+
+VAR_LINKAGE = SafeDict({0: "static", 1: "global", 2: "extern"})
+
+FWD_KIND = SafeDict(
+    {
+        0: "struct",
+        1: "union",
+    }
+)
+
+for val, name in KINDS.items():
+    globals()["BTF_KIND_" + name] = val
+
+
+def warn(message):
+    print(message, file=sys.stderr)
+
+
+def print_btf(filename):
+    if filename == "-":
+        buf = sys.stdin.buffer.read()
+    else:
+        with open(filename, "rb") as file:
+            buf = file.read()
+
+    fmt_cache = {}
+    endian_pfx = ""
+    off = 0
+
+    def unpack(fmt):
+        nonlocal off, endian_pfx
+        fmt = endian_pfx + fmt
+        if fmt not in fmt_cache:
+            fmt_cache[fmt] = struct.Struct(fmt)
+        st = fmt_cache[fmt]
+        r = st.unpack_from(buf, off)
+        off += st.size
+        return r
+
+    # Use magic number at the header start to determine endianness
+    (magic,) = unpack("H")
+    if magic == 0xEB9F:
+        endian_pfx = "<"
+    elif magic == 0x9FEB:
+        endian_pfx = ">"
+    else:
+        warn(f"Unexpected BTF magic: {magic:02x}")
+        return
+
+    # Rest of the header
+    version, flags, hdr_len = unpack("BBI")
+    type_off, type_len, str_off, str_len = unpack("IIII")
+
+    # Offsets in the header are relative to the end of a header
+    type_off += hdr_len
+    str_off += hdr_len
+    off = hdr_len
+    type_end = type_off + type_len
+
+    def string(rel_off):
+        try:
+            start = str_off + rel_off
+            end = buf.index(b"\0", start)
+            if start == end:
+                return "(anon)"
+            return buf[start:end].decode("utf8")
+        except ValueError as e:
+            warn(f"Can't get string at offset {str_off} + {rel_off}: {e}")
+            return f"<BAD_STRING {rel_off}>"
+
+    idx = 1
+    while off < type_end:
+        name_off, info, size = unpack("III")
+        kind = (info >> 24) & 0x1F
+        vlen = info & 0xFFFF
+        kflag = info >> 31
+        kind_name = KINDS[kind]
+        name = string(name_off)
+
+        def warn_nonzero(val, name):
+            nonlocal idx
+            if val != 0:
+                warn(f"<{idx}> {name} should be 0 but is {val}")
+
+        if kind == BTF_KIND_INT:
+            (info,) = unpack("I")
+            encoding = (info & 0x0F000000) >> 24
+            offset = (info & 0x00FF0000) >> 16
+            bits = info & 0x000000FF
+            enc_name = INT_ENCODING[encoding]
+            print(
+                f"[{idx}] {kind_name} '{name}' size={size} "
+                f"bits_offset={offset} "
+                f"nr_bits={bits} encoding={enc_name}"
+            )
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+
+        elif kind in [
+            BTF_KIND_PTR,
+            BTF_KIND_CONST,
+            BTF_KIND_VOLATILE,
+            BTF_KIND_RESTRICT,
+        ]:
+            print(f"[{idx}] {kind_name} '{name}' type_id={size}")
+            warn_nonzero(name_off, "name_off")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+
+        elif kind == BTF_KIND_ARRAY:
+            warn_nonzero(name_off, "name_off")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+            warn_nonzero(size, "size")
+            type, index_type, nelems = unpack("III")
+            print(
+                f"[{idx}] {kind_name} '{name}' type_id={type} "
+                f"index_type_id={index_type} nr_elems={nelems}"
+            )
+
+        elif kind in [BTF_KIND_STRUCT, BTF_KIND_UNION]:
+            print(f"[{idx}] {kind_name} '{name}' size={size} vlen={vlen}")
+            if kflag not in [0, 1]:
+                warn(f"<{idx}> kflag should be 0 or 1: {kflag}")
+            for _ in range(0, vlen):
+                name_off, type, offset = unpack("III")
+                if kflag == 0:
+                    print(
+                        f"\t'{string(name_off)}' type_id={type} "
+                        f"bits_offset={offset}"
+                    )
+                else:
+                    bits_offset = offset & 0xFFFFFF
+                    bitfield_size = offset >> 24
+                    print(
+                        f"\t'{string(name_off)}' type_id={type} "
+                        f"bits_offset={bits_offset} "
+                        f"bitfield_size={bitfield_size}"
+                    )
+
+        elif kind == BTF_KIND_ENUM:
+            encoding = ENUM_ENCODING[kflag]
+            print(
+                f"[{idx}] {kind_name} '{name}' encoding={encoding} "
+                f"size={size} vlen={vlen}"
+            )
+            for _ in range(0, vlen):
+                (name_off,) = unpack("I")
+                (val,) = unpack("i" if kflag == 1 else "I")
+                print(f"\t'{string(name_off)}' val={val}")
+
+        elif kind == BTF_KIND_ENUM64:
+            encoding = ENUM_ENCODING[kflag]
+            print(
+                f"[{idx}] {kind_name} '{name}' encoding={encoding} "
+                f"size={size} vlen={vlen}"
+            )
+            for _ in range(0, vlen):
+                name_off, lo, hi = unpack("III")
+                val = hi << 32 | lo
+                if kflag == 1:
+                    val = ctypes.c_long(val).value
+                print(f"\t'{string(name_off)}' val={val}LL")
+
+        elif kind == BTF_KIND_FWD:
+            print(f"[{idx}] {kind_name} '{name}' fwd_kind={FWD_KIND[kflag]}")
+            warn_nonzero(vlen, "vlen")
+            warn_nonzero(size, "size")
+
+        elif kind in [BTF_KIND_TYPEDEF, BTF_KIND_TYPE_TAG]:
+            print(f"[{idx}] {kind_name} '{name}' type_id={size}")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(kflag, "vlen")
+
+        elif kind == BTF_KIND_FUNC:
+            linkage = FUNC_LINKAGE[vlen]
+            print(f"[{idx}] {kind_name} '{name}' type_id={size} " f"linkage={linkage}")
+            warn_nonzero(kflag, "kflag")
+
+        elif kind == BTF_KIND_FUNC_PROTO:
+            print(f"[{idx}] {kind_name} '{name}' ret_type_id={size} " f"vlen={vlen}")
+            warn_nonzero(name_off, "name_off")
+            warn_nonzero(kflag, "kflag")
+            for _ in range(0, vlen):
+                name_off, type = unpack("II")
+                print(f"\t'{string(name_off)}' type_id={type}")
+
+        elif kind == BTF_KIND_VAR:
+            (linkage,) = unpack("I")
+            linkage = VAR_LINKAGE[linkage]
+            print(f"[{idx}] {kind_name} '{name}' type_id={size}, " f"linkage={linkage}")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+
+        elif kind == BTF_KIND_DATASEC:
+            print(f"[{idx}] {kind_name} '{name}' size={size} vlen={vlen}")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(size, "size")
+            for _ in range(0, vlen):
+                type, offset, size = unpack("III")
+                print(f"\ttype_id={type} offset={offset} size={size}")
+
+        elif kind == BTF_KIND_FLOAT:
+            print(f"[{idx}] {kind_name} '{name}' size={size}")
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+
+        elif kind == BTF_KIND_DECL_TAG:
+            (component_idx,) = unpack("i")
+            print(
+                f"[{idx}] {kind_name} '{name}' type_id={size} "
+                + f"component_idx={component_idx}"
+            )
+            warn_nonzero(kflag, "kflag")
+            warn_nonzero(vlen, "vlen")
+
+        else:
+            warn(
+                f"<{idx}> Unexpected entry: kind={kind_name} "
+                f"name_off={name_off} "
+                f"vlen={vlen} kflag={kflag} size={size}"
+            )
+
+        idx += 1
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        warn("Usage: {sys.argv[0]} <btf_file>")
+        sys.exit(1)
+    print_btf(sys.argv[1])
diff --git a/llvm/test/CodeGen/BPF/atomics_mem_order_v1.ll b/llvm/test/CodeGen/BPF/atomics_mem_order_v1.ll
new file mode 100644
index 00000000000000..31081586bf7afc
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/atomics_mem_order_v1.ll
@@ -0,0 +1,385 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=bpfel -mcpu=v1 -filetype=asm < %s | FileCheck %s
+;
+; Source:
+; $ cat atomics_mem_order_v1.c
+;   #include <stdatomic.h>
+;
+;   void test_fetch_add_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_add_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_sub_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_sub_64_ret(long _Atomic *i) {
+;      return __c11_atomic_fetch_sub(i, 10, memory_order_acquire) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_release) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_acq_rel) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_and_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_and_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_and(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_or_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_or_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_or(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_xor_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_xor_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_xor(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
+
+target triple = "bpf"
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_add_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_32_noret:
+; CHECK:       .Ltest_fetch_add_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_32_noret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += r3
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw add ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw add ptr %i, i32 10 release, align 4
+  %3 = atomicrmw add ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw add ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_add_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_64_noret:
+; CHECK:       .Ltest_fetch_add_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_64_noret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw add ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw add ptr %i, i64 10 release, align 8
+  %3 = atomicrmw add ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw add ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_sub_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_64_noret:
+; CHECK:       .Ltest_fetch_sub_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_64_noret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r2 = -r2
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw sub ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw sub ptr %i, i64 10 release, align 8
+  %3 = atomicrmw sub ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw sub ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_sub_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_64_ret:
+; CHECK:       .Ltest_fetch_sub_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_64_ret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r2 = -r2
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = r2
+; CHECK-NEXT:    r0 = atomic_fetch_add((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i64 10 acquire, align 8
+  %1 = atomicrmw sub ptr %i, i64 10 release, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw sub ptr %i, i64 10 acq_rel, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw sub ptr %i, i64 10 seq_cst, align 8
+  %add8 = add nsw i64 %add5, %3
+  ret i64 %add8
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_and_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_64_noret:
+; CHECK:       .Ltest_fetch_and_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_64_noret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) &= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_and((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw and ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw and ptr %i, i64 10 release, align 8
+  %3 = atomicrmw and ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw and ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_and_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_64_ret:
+; CHECK:       .Ltest_fetch_and_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_64_ret$local, at function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_and((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_and((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw and ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw and ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw and ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw and ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_or_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_64_noret:
+; CHECK:       .Ltest_fetch_or_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) |= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_or((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw or ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw or ptr %i, i64 10 release, align 8
+  %3 = atomicrmw or ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw or ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_or_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_64_ret:
+; CHECK:       .Ltest_fetch_or_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_or((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_or((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw or ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw or ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw or ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw or ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_xor_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_64_noret:
+; CHECK:       .Ltest_fetch_xor_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) ^= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw xor ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw xor ptr %i, i64 10 release, align 8
+  %3 = atomicrmw xor ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw xor ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_xor_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_64_ret:
+; CHECK:       .Ltest_fetch_xor_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_xor((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw xor ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw xor ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw xor ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw xor ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+attributes #0 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="v1" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 6f71e34e194dab5a52cb2211af575c6067e9e504)"}
diff --git a/llvm/test/CodeGen/BPF/atomics_mem_order_v3.ll b/llvm/test/CodeGen/BPF/atomics_mem_order_v3.ll
new file mode 100644
index 00000000000000..20b9ebcb0d473b
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/atomics_mem_order_v3.ll
@@ -0,0 +1,781 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=bpfel -mcpu=v3 -filetype=asm < %s | FileCheck %s
+;
+; Source:
+; $ cat atomics_mem_order_v3.c
+;   #include <stdatomic.h>
+;
+;   void test_fetch_add_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   int test_fetch_add_32_ret(int _Atomic *i) {
+;     return __c11_atomic_fetch_add(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_add_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_add_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_add(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_add(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_sub_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   int test_fetch_sub_32_ret(int _Atomic *i) {
+;      return __c11_atomic_fetch_sub(i, 10, memory_order_relaxed) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_acquire) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_release) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_acq_rel) +
+;             __c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_sub_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_sub_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_sub(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_sub(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_sub(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_sub(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_sub(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_and_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   int test_fetch_and_32_ret(int _Atomic *i) {
+;     return __c11_atomic_fetch_and(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_and_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_and_64_ret(long _Atomic *i) {
+;      return __c11_atomic_fetch_and(i, 10, memory_order_relaxed) +
+;             __c11_atomic_fetch_and(i, 10, memory_order_acquire) +
+;             __c11_atomic_fetch_and(i, 10, memory_order_release) +
+;             __c11_atomic_fetch_and(i, 10, memory_order_acq_rel) +
+;             __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_or_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   int test_fetch_or_32_ret(int _Atomic *i) {
+;      return __c11_atomic_fetch_or(i, 10, memory_order_relaxed) +
+;             __c11_atomic_fetch_or(i, 10, memory_order_acquire) +
+;             __c11_atomic_fetch_or(i, 10, memory_order_release) +
+;             __c11_atomic_fetch_or(i, 10, memory_order_acq_rel) +
+;             __c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_or_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_or_64_ret(long _Atomic *i) {
+;     return __c11_atomic_fetch_or(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_or(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_xor_32_noret(int _Atomic *i) {
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
+;
+;   int test_fetch_xor_32_ret(int _Atomic *i) {
+;     return __c11_atomic_fetch_xor(i, 10, memory_order_relaxed) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_acquire) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_release) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_acq_rel) +
+;            __c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
+;
+;   void test_fetch_xor_64_noret(long _Atomic *i) {
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_relaxed);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acquire);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_release);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_acq_rel);
+;     (void)__c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
+;
+;   long test_fetch_xor_64_ret(long _Atomic *i) {
+;      return __c11_atomic_fetch_xor(i, 10, memory_order_relaxed) +
+;             __c11_atomic_fetch_xor(i, 10, memory_order_acquire) +
+;             __c11_atomic_fetch_xor(i, 10, memory_order_release) +
+;             __c11_atomic_fetch_xor(i, 10, memory_order_acq_rel) +
+;             __c11_atomic_fetch_xor(i, 10, memory_order_seq_cst);
+;   }
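+;
+; For -mcpu=v3, a relaxed (monotonic) atomicrmw whose result is unused is
+; expected to lower to a BPF "lock" instruction, while acquire, release,
+; acq_rel and seq_cst orderings (and any use of the fetched value) lower to
+; atomic_fetch_*() instructions, matching the checks below.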
+
+target triple = "bpf"
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_add_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_32_noret:
+; CHECK:       .Ltest_fetch_add_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_32_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw add ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw add ptr %i, i32 10 release, align 4
+  %3 = atomicrmw add ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw add ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i32 @test_fetch_add_32_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_32_ret:
+; CHECK:       .Ltest_fetch_add_32_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_32_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += w3
+; CHECK-NEXT:    w0 = 10
+; CHECK-NEXT:    w0 = atomic_fetch_add((u32 *)(r1 + 0), w0)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    w0 += w2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw add ptr %i, i32 10 acquire, align 4
+  %add = add nsw i32 %1, %0
+  %2 = atomicrmw add ptr %i, i32 10 release, align 4
+  %add5 = add nsw i32 %add, %2
+  %3 = atomicrmw add ptr %i, i32 10 acq_rel, align 4
+  %add8 = add nsw i32 %add5, %3
+  %4 = atomicrmw add ptr %i, i32 10 seq_cst, align 4
+  %add11 = add nsw i32 %add8, %4
+  ret i32 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_add_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_64_noret:
+; CHECK:       .Ltest_fetch_add_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw add ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw add ptr %i, i64 10 release, align 8
+  %3 = atomicrmw add ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw add ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_add_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_add_64_ret:
+; CHECK:       .Ltest_fetch_add_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_add_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_add((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw add ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw add ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw add ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw add ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw add ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_sub_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_32_noret:
+; CHECK:       .Ltest_fetch_sub_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_32_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w2 = -w2
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += w3
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw sub ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw sub ptr %i, i32 10 release, align 4
+  %3 = atomicrmw sub ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw sub ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i32 @test_fetch_sub_32_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_32_ret:
+; CHECK:       .Ltest_fetch_sub_32_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_32_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w2 = -w2
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) += w3
+; CHECK-NEXT:    w0 = w2
+; CHECK-NEXT:    w0 = atomic_fetch_add((u32 *)(r1 + 0), w0)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = w2
+; CHECK-NEXT:    w3 = atomic_fetch_add((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    w0 += w2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw sub ptr %i, i32 10 acquire, align 4
+  %add = add nsw i32 %1, %0
+  %2 = atomicrmw sub ptr %i, i32 10 release, align 4
+  %add5 = add nsw i32 %add, %2
+  %3 = atomicrmw sub ptr %i, i32 10 acq_rel, align 4
+  %add8 = add nsw i32 %add5, %3
+  %4 = atomicrmw sub ptr %i, i32 10 seq_cst, align 4
+  %add11 = add nsw i32 %add8, %4
+  ret i32 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_sub_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_64_noret:
+; CHECK:       .Ltest_fetch_sub_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r2 = -r2
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw sub ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw sub ptr %i, i64 10 release, align 8
+  %3 = atomicrmw sub ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw sub ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_sub_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_sub_64_ret:
+; CHECK:       .Ltest_fetch_sub_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_sub_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r2 = -r2
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) += r3
+; CHECK-NEXT:    r0 = r2
+; CHECK-NEXT:    r0 = atomic_fetch_add((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 = atomic_fetch_add((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw sub ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw sub ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw sub ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw sub ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw sub ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_and_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_32_noret:
+; CHECK:       .Ltest_fetch_and_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_32_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) &= w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_and((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_and((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_and((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w2 = atomic_fetch_and((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw and ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw and ptr %i, i32 10 release, align 4
+  %3 = atomicrmw and ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw and ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i32 @test_fetch_and_32_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_32_ret:
+; CHECK:       .Ltest_fetch_and_32_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_32_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) &= w3
+; CHECK-NEXT:    w0 = 10
+; CHECK-NEXT:    w0 = atomic_fetch_and((u32 *)(r1 + 0), w0)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_and((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_and((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w2 = atomic_fetch_and((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    w0 += w2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw and ptr %i, i32 10 acquire, align 4
+  %add = add nsw i32 %1, %0
+  %2 = atomicrmw and ptr %i, i32 10 release, align 4
+  %add5 = add nsw i32 %add, %2
+  %3 = atomicrmw and ptr %i, i32 10 acq_rel, align 4
+  %add8 = add nsw i32 %add5, %3
+  %4 = atomicrmw and ptr %i, i32 10 seq_cst, align 4
+  %add11 = add nsw i32 %add8, %4
+  ret i32 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_and_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_64_noret:
+; CHECK:       .Ltest_fetch_and_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) &= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_and((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw and ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw and ptr %i, i64 10 release, align 8
+  %3 = atomicrmw and ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw and ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_and_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_and_64_ret:
+; CHECK:       .Ltest_fetch_and_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_and_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_and((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_and((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_and((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw and ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw and ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw and ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw and ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw and ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_or_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_32_noret:
+; CHECK:       .Ltest_fetch_or_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_32_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) |= w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_or((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_or((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_or((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w2 = atomic_fetch_or((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw or ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw or ptr %i, i32 10 release, align 4
+  %3 = atomicrmw or ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw or ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i32 @test_fetch_or_32_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_32_ret:
+; CHECK:       .Ltest_fetch_or_32_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_32_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) |= w3
+; CHECK-NEXT:    w0 = 10
+; CHECK-NEXT:    w0 = atomic_fetch_or((u32 *)(r1 + 0), w0)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_or((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_or((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w2 = atomic_fetch_or((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    w0 += w2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw or ptr %i, i32 10 acquire, align 4
+  %add = add nsw i32 %1, %0
+  %2 = atomicrmw or ptr %i, i32 10 release, align 4
+  %add5 = add nsw i32 %add, %2
+  %3 = atomicrmw or ptr %i, i32 10 acq_rel, align 4
+  %add8 = add nsw i32 %add5, %3
+  %4 = atomicrmw or ptr %i, i32 10 seq_cst, align 4
+  %add11 = add nsw i32 %add8, %4
+  ret i32 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_or_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_64_noret:
+; CHECK:       .Ltest_fetch_or_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) |= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_or((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw or ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw or ptr %i, i64 10 release, align 8
+  %3 = atomicrmw or ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw or ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_or_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_or_64_ret:
+; CHECK:       .Ltest_fetch_or_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_or_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_or((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_or((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_or((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw or ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw or ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw or ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw or ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw or ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_xor_32_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_32_noret:
+; CHECK:       .Ltest_fetch_xor_32_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_32_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) ^= w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_xor((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_xor((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_xor((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w2 = atomic_fetch_xor((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw xor ptr %i, i32 10 acquire, align 4
+  %2 = atomicrmw xor ptr %i, i32 10 release, align 4
+  %3 = atomicrmw xor ptr %i, i32 10 acq_rel, align 4
+  %4 = atomicrmw xor ptr %i, i32 10 seq_cst, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i32 @test_fetch_xor_32_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_32_ret:
+; CHECK:       .Ltest_fetch_xor_32_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_32_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    w2 = 10
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    lock *(u32 *)(r1 + 0) ^= w3
+; CHECK-NEXT:    w0 = 10
+; CHECK-NEXT:    w0 = atomic_fetch_xor((u32 *)(r1 + 0), w0)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_xor((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w3 = 10
+; CHECK-NEXT:    w3 = atomic_fetch_xor((u32 *)(r1 + 0), w3)
+; CHECK-NEXT:    w0 += w3
+; CHECK-NEXT:    w2 = atomic_fetch_xor((u32 *)(r1 + 0), w2)
+; CHECK-NEXT:    w0 += w2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i32 10 monotonic, align 4
+  %1 = atomicrmw xor ptr %i, i32 10 acquire, align 4
+  %add = add nsw i32 %1, %0
+  %2 = atomicrmw xor ptr %i, i32 10 release, align 4
+  %add5 = add nsw i32 %add, %2
+  %3 = atomicrmw xor ptr %i, i32 10 acq_rel, align 4
+  %add8 = add nsw i32 %add5, %3
+  %4 = atomicrmw xor ptr %i, i32 10 seq_cst, align 4
+  %add11 = add nsw i32 %add8, %4
+  ret i32 %add11
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local void @test_fetch_xor_64_noret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_64_noret:
+; CHECK:       .Ltest_fetch_xor_64_noret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_64_noret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    lock *(u64 *)(r1 + 0) ^= r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw xor ptr %i, i64 10 acquire, align 8
+  %2 = atomicrmw xor ptr %i, i64 10 release, align 8
+  %3 = atomicrmw xor ptr %i, i64 10 acq_rel, align 8
+  %4 = atomicrmw xor ptr %i, i64 10 seq_cst, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_xor_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+; CHECK-LABEL: test_fetch_xor_64_ret:
+; CHECK:       .Ltest_fetch_xor_64_ret$local:
+; CHECK-NEXT:    .type .Ltest_fetch_xor_64_ret$local,@function
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    r2 = 10
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 = 10
+; CHECK-NEXT:    r0 = atomic_fetch_xor((u64 *)(r1 + 0), r0)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r3 = 10
+; CHECK-NEXT:    r3 = atomic_fetch_xor((u64 *)(r1 + 0), r3)
+; CHECK-NEXT:    r0 += r3
+; CHECK-NEXT:    r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2)
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+entry:
+  %0 = atomicrmw xor ptr %i, i64 10 monotonic, align 8
+  %1 = atomicrmw xor ptr %i, i64 10 acquire, align 8
+  %add = add nsw i64 %1, %0
+  %2 = atomicrmw xor ptr %i, i64 10 release, align 8
+  %add5 = add nsw i64 %add, %2
+  %3 = atomicrmw xor ptr %i, i64 10 acq_rel, align 8
+  %add8 = add nsw i64 %add5, %3
+  %4 = atomicrmw xor ptr %i, i64 10 seq_cst, align 8
+  %add11 = add nsw i64 %add8, %4
+  ret i64 %add11
+}
+
+attributes #0 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="v3" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 6f71e34e194dab5a52cb2211af575c6067e9e504)"}
diff --git a/llvm/test/CodeGen/BPF/atomics_sub64_relaxed_v1.ll b/llvm/test/CodeGen/BPF/atomics_sub64_relaxed_v1.ll
new file mode 100644
index 00000000000000..4d630d475b2962
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/atomics_sub64_relaxed_v1.ll
@@ -0,0 +1,27 @@
+; RUN: not llc -march=bpfel -mcpu=v1 -filetype=asm < %s
+;
+; Source:
+; $ cat atomics_sub64_relaxed_v1.c
+;   #include <stdatomic.h>
+;
+;   long test_fetch_sub_64_ret(long _Atomic *i) {
+;      return __c11_atomic_fetch_sub(i, 10, memory_order_relaxed);
+;   }
+
+target triple = "bpf"
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_sub_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+entry:
+  %0 = atomicrmw sub ptr %i, i64 10 monotonic, align 8
+  ret i64 %0
+}
+
+attributes #0 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="v1" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 6f71e34e194dab5a52cb2211af575c6067e9e504)"}
diff --git a/llvm/test/CodeGen/BPF/xaddd_v1.ll b/llvm/test/CodeGen/BPF/xaddd_v1.ll
new file mode 100644
index 00000000000000..d3bfd8d81b15b5
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/xaddd_v1.ll
@@ -0,0 +1,25 @@
+; RUN: not llc -march=bpfel -mcpu=v1 -filetype=asm < %s
+;
+; Source:
+; $ cat xaddd_v1.c
+; long test_fetch_add_64_ret(long *i) {
+;   return __sync_fetch_and_add(i, 10);
+; }
+
+target triple = "bpf"
+
+; Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+define dso_local i64 @test_fetch_add_64_ret(ptr nocapture noundef %i) local_unnamed_addr #0 {
+entry:
+  %0 = atomicrmw add ptr %i, i64 10 seq_cst, align 8
+  ret i64 %0
+}
+
+attributes #0 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="v1" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 20.0.0git (git at github.com:yonghong-song/llvm-project.git 6f71e34e194dab5a52cb2211af575c6067e9e504)"}
