[clang] [llvm] [RFC][BPF] Do atomic_fetch_*() pattern matching with memory ordering (PR #107343)

via cfe-commits cfe-commits at lists.llvm.org
Mon Sep 9 13:52:29 PDT 2024


https://github.com/yonghong-song updated https://github.com/llvm/llvm-project/pull/107343

>From bcca7b61a1b89698d259a7be84d9babfad674db7 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Tue, 3 Sep 2024 21:26:17 -0700
Subject: [PATCH 1/2] [RFC][BPF] Do atomic_fetch_*() pattern matching with
 memory ordering

For atomic fetch_and_*() operations, do pattern matching with the memory
orderings seq_cst, acq_rel, release, acquire and monotonic (relaxed). For
fetch_and_*() operations with seq_cst/acq_rel/release/acquire ordering,
atomic_fetch_*() instructions are generated. For monotonic ordering, locked
insns are generated if the return value is not used; otherwise, atomic_fetch_*()
insns are used.
The main motivation is to resolve the kernel issue [1].

The following memory orderings are supported:
  seq_cst, acq_rel, release, acquire, relaxed
The current gcc-style __sync_fetch_and_*() operations are all seq_cst.
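
For instance, a BPF program that wants the new orderings could gate on the
__BPF_FEATURE_ATOMIC_MEM_ORDERING macro defined by this patch and fall back to
the seq_cst gcc-style builtin otherwise. This is a minimal sketch; the fallback
path (and the cast that drops _Atomic) is an assumption about how callers might
support older compilers, not part of this patch:

```
#include <stdatomic.h>

/* If the compiler advertises the new feature macro, use the C11 builtin
 * with an explicit (here: relaxed) ordering; otherwise fall back to the
 * gcc-style builtin, which is always seq_cst.
 */
static inline int fetch_and_relaxed(_Atomic int *p, int mask)
{
#ifdef __BPF_FEATURE_ATOMIC_MEM_ORDERING
        return __c11_atomic_fetch_and(p, mask, memory_order_relaxed);
#else
        return __sync_fetch_and_and((int *)p, mask);
#endif
}
```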

To use an explicit memory ordering, the _Atomic type is needed. The following
is an example:

```
$ cat test.c
#include <stdatomic.h>
void f1(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
void f2(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
void f3(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test.c -o test.o && llvm-objdump -d test.o
$ ./run.sh

test.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       1:       c3 21 00 00 50 00 00 00 lock *(u32 *)(r1 + 0x0) &= w2
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       4:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       7:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       8:       95 00 00 00 00 00 00 00 exit
```

The following is another example where return value is used:

```
$ cat test1.c
#include <stdatomic.h>
int f1(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
int f2(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
int f3(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test1.c -o test1.o && llvm-objdump -d test1.o
$ ./run.sh

test1.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       1:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       4:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       7:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       8:       95 00 00 00 00 00 00 00 exit
```

You can see that for relaxed memory ordering, if the return value is used, the
atomic_fetch_and() insn is generated; otherwise, the locked insn is used.

Here is another example with global _Atomic variable:

```
$ cat test3.c
#include <stdatomic.h>

_Atomic int i;

void f1(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_relaxed);
}
void f2(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test3.c -o test3.o && llvm-objdump -d test3.o
$ ./run.sh

test3.o:        file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       1:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       3:       c3 12 00 00 50 00 00 00 lock *(u32 *)(r2 + 0x0) &= w1
       4:       95 00 00 00 00 00 00 00 exit

0000000000000028 <f2>:
       5:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       6:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       8:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
       9:       95 00 00 00 00 00 00 00 exit
```

Note that '-g' is not used in the above compilations. The reason is the
following IR related to the _Atomic type:
```
$ clang -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -g -S -emit-llvm test3.c
```
The related debug info for test3.c:
```
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "i", scope: !2, file: !3, line: 3, type: !16, isLocal: false, isDefinition: true)
...
!16 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !17)
!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
```

When compiling test.c, the related debug info is:
```
...
!19 = distinct !DISubprogram(name: "f1", scope: !1, file: !1, line: 3, type: !20, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !25)
!20 = !DISubroutineType(types: !21)
!21 = !{null, !22}
!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
!23 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !24)
!24 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!25 = !{!26}
!26 = !DILocalVariable(name: "i", arg: 1, scope: !19, file: !1, line: 3, type: !22)
```

All the above suggests that _Atomic behaves like a modifier (e.g. const,
restrict, volatile). This seems true based on the DWARF doc [2].

Without properly handling DW_TAG_atomic_type, llvm BTF generation will be
incorrect since the current implementation assumes DW_TAG_atomic_type does not
exist. So we have two choices here:
  (1). The llvm bpf backend processes DW_TAG_atomic_type but ignores it in the
       BTF encoding.
  (2). Add another type, e.g., BTF_KIND_ATOMIC, to BTF. BTF_KIND_ATOMIC behaves
       as a modifier like const/volatile/restrict.

For choice (1), the llvm bpf backend should skip dwarf::DW_TAG_atomic_type
during BTF generation whenever necessary.

For choice (2), BTF_KIND_ATOMIC would be added to BTF, so both the llvm
backend and the kernel need to handle it properly. Its main advantage is
probably that the atomic type is preserved and therefore also available to
the skeleton. But I think for the skeleton a raw type might be good enough,
unless user space intends to do some atomic operation on it, which is an
unlikely case.
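
For illustration, a modifier-style BTF_KIND_ATOMIC entry would presumably be
encoded the same way const/volatile/restrict are. A minimal sketch; the kind
value and the encode_atomic() helper are placeholders for illustration, not
something BTF defines today:

```
#include <linux/btf.h>   /* struct btf_type */

/* Placeholder value; the real number would be assigned when/if the
 * kernel adopts the new kind. */
#define BTF_KIND_ATOMIC 20

/* Like the existing modifiers: an anonymous entry whose 'type' member
 * points at the id of the wrapped base type, with vlen and kind_flag
 * both zero. */
static void encode_atomic(struct btf_type *t, __u32 base_type_id)
{
        t->name_off = 0;                 /* modifiers carry no name */
        t->info = BTF_KIND_ATOMIC << 24; /* kind in bits 24-28, vlen = 0 */
        t->type = base_type_id;          /* id of the wrapped type */
}
```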

So I choose choice (1) in this RFC implementation.

 [1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/
 [2] https://dwarfstd.org/issues/131112.1.html
---
 clang/lib/Basic/Targets/BPF.cpp       |   1 +
 llvm/lib/Target/BPF/BPFInstrInfo.td   | 113 +++++++++++++++++++++++---
 llvm/lib/Target/BPF/BPFMIChecking.cpp |  91 ++++++++++++++++++---
 3 files changed, 184 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index a94ceee5a6a5e7..77e3a9388b0c46 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -37,6 +37,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
   }
 
   Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST");
+  Builder.defineMacro("__BPF_FEATURE_ATOMIC_MEM_ORDERING");
 
   if (CPU.empty())
     CPU = "v3";
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index f7e17901c7ed5e..68b0d1b70efe20 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -864,26 +864,119 @@ class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
 
 let Constraints = "$dst = $val" in {
   let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
-    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32>;
-    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32>;
-    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or",  atomic_load_or_i32>;
-    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32>;
+    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32_seq_cst>;
+    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32_seq_cst>;
+    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or",  atomic_load_or_i32_seq_cst>;
+    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32_seq_cst>;
   }
 
   let Predicates = [BPFHasALU32] in {
-    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64>;
+    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64_seq_cst>;
   }
-  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64>;
-  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or",  atomic_load_or_i64>;
-  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64>;
+  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64_seq_cst>;
+  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or",  atomic_load_or_i64_seq_cst>;
+  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64_seq_cst>;
+}
+
+let Predicates = [BPFHasALU32] in {
+    def : Pat<(atomic_load_add_i32_monotonic ADDRri:$addr, GPR32:$val),
+              (XADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_acquire ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_release ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_acq_rel ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+
+    def : Pat<(atomic_load_add_i64_monotonic ADDRri:$addr, GPR:$val),
+              (XADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_acquire ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_release ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_acq_rel ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
 }
 
 // atomic_load_sub can be represented as a neg followed
 // by an atomic_load_add.
-def : Pat<(atomic_load_sub_i32 ADDRri:$addr, GPR32:$val),
+// FIXME: the below can probably be simplified.
+def : Pat<(atomic_load_sub_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_release ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_seq_cst ADDRri:$addr, GPR32:$val),
           (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
-def : Pat<(atomic_load_sub_i64 ADDRri:$addr, GPR:$val),
+
+def : Pat<(atomic_load_sub_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_release ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_acq_rel ADDRri:$addr, GPR:$val),
           (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_seq_cst ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+
+def : Pat<(atomic_load_and_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_release ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+
+
+def : Pat<(atomic_load_and_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_release ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+
+def : Pat<(atomic_load_or_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_release ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+
+def : Pat<(atomic_load_or_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_release ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+
+def : Pat<(atomic_load_xor_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_release ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+
+def : Pat<(atomic_load_xor_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_release ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
 
 // Atomic Exchange
 class XCHG<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
index 24224f6c1e9e66..6010539d21bad0 100644
--- a/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -43,14 +43,14 @@ struct BPFMIPreEmitChecking : public MachineFunctionPass {
   // Initialize class variables.
   void initialize(MachineFunction &MFParm);
 
-  void processAtomicInsts();
+  bool processAtomicInsts();
 
 public:
   // Main entry point for this pass.
   bool runOnMachineFunction(MachineFunction &MF) override {
     if (!skipFunction(MF.getFunction())) {
       initialize(MF);
-      processAtomicInsts();
+      return processAtomicInsts();
     }
     return false;
   }
@@ -152,22 +152,91 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
   return false;
 }
 
-void BPFMIPreEmitChecking::processAtomicInsts() {
+bool BPFMIPreEmitChecking::processAtomicInsts() {
+  if (!MF->getSubtarget<BPFSubtarget>().getHasJmp32()) {
+    // Only check for cpu version 1 and 2.
+    for (MachineBasicBlock &MBB : *MF) {
+      for (MachineInstr &MI : MBB) {
+        if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD)
+          continue;
+
+        LLVM_DEBUG(MI.dump());
+        if (hasLiveDefs(MI, TRI)) {
+          DebugLoc Empty;
+          const DebugLoc &DL = MI.getDebugLoc();
+          const Function &F = MF->getFunction();
+          F.getContext().diagnose(DiagnosticInfoUnsupported{
+              F, "Invalid usage of the XADD return value", DL});
+        }
+      }
+    }
+  }
+
+  // Check return values of atomic_fetch_and_{add,and,or,xor}.
+  // If the return is not used, the atomic_fetch_and_<op> instruction
+  // is replaced with atomic_<op> instruction.
+  MachineInstr *ToErase = nullptr;
+  bool Changed = false;
+  const BPFInstrInfo *TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
   for (MachineBasicBlock &MBB : *MF) {
     for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD)
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      if (MI.getOpcode() != BPF::XADDW32 && MI.getOpcode() != BPF::XADDD &&
+          MI.getOpcode() != BPF::XANDW32 && MI.getOpcode() != BPF::XANDD &&
+          MI.getOpcode() != BPF::XXORW32 && MI.getOpcode() != BPF::XXORD &&
+          MI.getOpcode() != BPF::XORW32 && MI.getOpcode() != BPF::XORD)
         continue;
 
-      LLVM_DEBUG(MI.dump());
-      if (hasLiveDefs(MI, TRI)) {
-        DebugLoc Empty;
-        const DebugLoc &DL = MI.getDebugLoc();
-        const Function &F = MF->getFunction();
-        F.getContext().diagnose(DiagnosticInfoUnsupported{
-            F, "Invalid usage of the XADD return value", DL});
+      if (!hasLiveDefs(MI, TRI))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Transforming "; MI.dump());
+      unsigned newOpcode;
+      switch (MI.getOpcode()) {
+      case BPF::XADDW32:
+        newOpcode = BPF::XFADDW32;
+        break;
+      case BPF::XADDD:
+        newOpcode = BPF::XFADDD;
+        break;
+      case BPF::XANDW32:
+        newOpcode = BPF::XFANDW32;
+        break;
+      case BPF::XANDD:
+        newOpcode = BPF::XFANDD;
+        break;
+      case BPF::XXORW32:
+        newOpcode = BPF::XFXORW32;
+        break;
+      case BPF::XXORD:
+        newOpcode = BPF::XFXORD;
+        break;
+      case BPF::XORW32:
+        newOpcode = BPF::XFORW32;
+        break;
+      case BPF::XORD:
+        newOpcode = BPF::XFORD;
+        break;
+      default:
+        llvm_unreachable("Incorrect Atomic Instruction Opcode");
       }
+
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(newOpcode))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(1))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3));
+
+      ToErase = &MI;
+      Changed = true;
     }
   }
+
+  return Changed;
 }
 
 } // namespace

>From 8b82c54222202c097184019d6c64b3b9bb3f96fa Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Mon, 9 Sep 2024 11:04:17 -0700
Subject: [PATCH 2/2] [RFC][BPF] Handle DW_TAG_atomic_type properly

Make a change in BTFDebug.cpp to handle DW_TAG_atomic_type properly.
Otherwise, for a type like
   _Atomic int i; // global
the dwarf type chain is atomic->int.
Since DW_TAG_atomic_type is not processed, BTF generation stops at the
atomic modifier and BTF encodes 'i' as the void type.

Similarly, for a type like
  volatile _Atomic int *p;
the dwarf type chain is ptr->volatile->atomic->int.
Since the atomic type is not processed, BTF generation stops at the
atomic type, and the eventual BTF type will be
  ptr->volatile->void
which is incorrect.

This patch fixes the above two patterns by skipping
DW_TAG_atomic_type. There could be more cases, which I will try to fix
with more test cases. This is an RFC patch and the current
implementation is good enough to run the kernel selftests with
_Atomic usage ([1]).

In the kernel selftest arena_atomics.c, the new bpf code looks like:

```
_Atomic __u64 __arena_global and64_value = (0x110ull << 32);
_Atomic __u32 __arena_global and32_value = 0x110;

SEC("raw_tp/sys_enter")
int and(const void *ctx)
{
	...
        __c11_atomic_fetch_and(&and64_value, 0x011ull << 32, memory_order_relaxed);
        __c11_atomic_fetch_and(&and32_value, 0x011, memory_order_relaxed);
	...

        return 0;
}
```

The skel file arena_atomics.skel.h has:
```
struct arena_atomics__arena {
	...
	__u64 and64_value;
	__u32 and32_value;
	...
} *arena;
```
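
For reference, user space then sees these fields as ordinary integers through
the generated skeleton; nothing atomic survives there, which matches the point
made in the first patch that a raw type is good enough for the skeleton. A
minimal, hypothetical sketch of such a reader (the skeleton helper names follow
the usual <name>__open_and_load()/<name>__destroy() pattern and the mmap'ed
'arena' pointer is assumed from the struct above):

```
#include <stdio.h>
#include "arena_atomics.skel.h"   /* generated by bpftool gen skeleton */

int main(void)
{
        struct arena_atomics *skel = arena_atomics__open_and_load();

        if (!skel)
                return 1;
        /* ... attach and trigger the raw_tp program here ... */
        printf("and64_value = 0x%llx\n",
               (unsigned long long)skel->arena->and64_value);
        arena_atomics__destroy(skel);
        return 0;
}
```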

  [1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/
---
 llvm/lib/Target/BPF/BTFDebug.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 4d847abea731dc..1cd82720fa7e81 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -91,6 +91,12 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
 
   // The base type for PTR/CONST/VOLATILE could be void.
   const DIType *ResolvedType = DTy->getBaseType();
+  if (ResolvedType) {
+    const auto *DerivedTy = dyn_cast<DIDerivedType>(ResolvedType);
+    if (DerivedTy && DerivedTy->getTag() == dwarf::DW_TAG_atomic_type)
+      ResolvedType = DerivedTy->getBaseType();
+  }
+
   if (!ResolvedType) {
     assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
             Kind == BTF::BTF_KIND_VOLATILE) &&
@@ -800,6 +806,10 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
                                 bool CheckPointer, bool SeenPointer) {
   unsigned Tag = DTy->getTag();
 
+  if (Tag == dwarf::DW_TAG_atomic_type)
+    return visitTypeEntry(DTy->getBaseType(), TypeId, CheckPointer,
+                          SeenPointer);
+
   /// Try to avoid chasing pointees, esp. structure pointees which may
   /// unnecessary bring in a lot of types.
   if (CheckPointer && !SeenPointer) {
@@ -1444,8 +1454,15 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
       DIGlobal = GVE->getVariable();
       if (SecName.starts_with(".maps"))
         visitMapDefType(DIGlobal->getType(), GVTypeId);
-      else
-        visitTypeEntry(DIGlobal->getType(), GVTypeId, false, false);
+      else {
+        const DIType *Ty = DIGlobal->getType();
+        if (Ty) {
+          auto *DTy = dyn_cast<DIDerivedType>(Ty);
+          if (DTy && DTy->getTag() == dwarf::DW_TAG_atomic_type)
+            Ty = DTy->getBaseType();
+        }
+        visitTypeEntry(Ty, GVTypeId, false, false);
+      }
       break;
     }
 


