[llvm] [RFC][BPF] Do atomic_fetch_*() pattern matching with memory ordering (PR #107343)

via llvm-commits <llvm-commits@lists.llvm.org>
Sun Sep 8 22:21:54 PDT 2024


https://github.com/yonghong-song updated https://github.com/llvm/llvm-project/pull/107343

From a72772ed2b37314ec8a8d4909f94cf5aeac7d544 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 3 Sep 2024 21:26:17 -0700
Subject: [PATCH] [RFC][BPF] Do atomic_fetch_*() pattern matching with memory
 ordering

For atomic fetch_and_*() operations, do pattern matching with the memory
orderings seq_cst, acq_rel, release, acquire and monotonic (relaxed). For
fetch_and_*() operations with seq_cst/acq_rel/release/acquire ordering,
atomic_fetch_*() instructions are generated. For monotonic ordering, locked
insns are generated if the return value is not used; otherwise, atomic_fetch_*()
insns are used.
The main motivation is to resolve the kernel issue [1].

The following memory orderings are supported:
  seq_cst, acq_rel, release, acquire, relaxed
The current gcc-style __sync_fetch_and_*() operations are all seq_cst.
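For example (an illustrative snippet, not from the original post), the builtin
below carries no ordering argument and is treated as seq_cst:

```
void f0(int *i) {
   /* gcc-style builtin: no memory-order argument, always seq_cst */
   (void)__sync_fetch_and_and(i, 10);
}
```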

To use explicit memory ordering, the _Atomic type is needed.  The following is
an example:

```
$ cat test.c
#include <stdatomic.h>
void f1(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
void f2(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
void f3(_Atomic int *i) {
   (void)__c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test.c -o test.o && llvm-objdump -d test.o
$ ./run.sh

test.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       1:       c3 21 00 00 50 00 00 00 lock *(u32 *)(r1 + 0x0) &= w2
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       4:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 02 00 00 0a 00 00 00 w2 = 0xa
       7:       c3 21 00 00 51 00 00 00 w2 = atomic_fetch_and((u32 *)(r1 + 0x0), w2)
       8:       95 00 00 00 00 00 00 00 exit
```

The following is another example where the return value is used:

```
$ cat test1.c
#include <stdatomic.h>
int f1(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}
int f2(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_acquire);
}
int f3(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test1.c -o test1.o && llvm-objdump -d test1.o
$ ./run.sh

test1.o: file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       1:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       2:       95 00 00 00 00 00 00 00 exit

0000000000000018 <f2>:
       3:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       4:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       5:       95 00 00 00 00 00 00 00 exit

0000000000000030 <f3>:
       6:       b4 00 00 00 0a 00 00 00 w0 = 0xa
       7:       c3 01 00 00 51 00 00 00 w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0)
       8:       95 00 00 00 00 00 00 00 exit
```

You can see that for relaxed memory ordering, an atomic_fetch_and() insn is
used if the return value is used; otherwise, a locked insn is used.
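As a further illustration (a hypothetical test2.c, not from the original post),
fetch_sub follows the same rule: per the new patterns in this patch, a relaxed
fetch_sub whose result is unused should lower to a register negate followed by
a locked add:

```
#include <stdatomic.h>
void f1(_Atomic int *i) {
   /* relaxed, result unused: expected to become neg + lock add */
   (void)__c11_atomic_fetch_sub(i, 10, memory_order_relaxed);
}
```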

Here is another example with a global _Atomic variable:

```
$ cat test3.c
#include <stdatomic.h>

_Atomic int i;

void f1(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_relaxed);
}
void f2(void) {
   (void)__c11_atomic_fetch_and(&i, 10, memory_order_seq_cst);
}
$ cat run.sh
clang  -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -c test3.c -o test3.o && llvm-objdump -d test3.o
$ ./run.sh

test3.o:        file format elf64-bpf

Disassembly of section .text:

0000000000000000 <f1>:
       0:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       1:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       3:       c3 12 00 00 50 00 00 00 lock *(u32 *)(r2 + 0x0) &= w1
       4:       95 00 00 00 00 00 00 00 exit

0000000000000028 <f2>:
       5:       b4 01 00 00 0a 00 00 00 w1 = 0xa
       6:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
       8:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
       9:       95 00 00 00 00 00 00 00 exit
```

Note that '-g' is not used in the above compilations. The reason is the IR
generated for the _Atomic type when compiling with '-g':
```
$ clang -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -O2 --target=bpf -g -S -emit-llvm test3.c
```
The related debug info for test3.c:
```
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "i", scope: !2, file: !3, line: 3, type: !16, isLocal: false, isDefinition: true)
...
!16 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !17)
!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
...
```

If compiling test.c, the related debug info:
```
...
!19 = distinct !DISubprogram(name: "f1", scope: !1, file: !1, line: 3, type: !20, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !25)
!20 = !DISubroutineType(types: !21)
!21 = !{null, !22}
!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
!23 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !24)
!24 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!25 = !{!26}
!26 = !DILocalVariable(name: "i", arg: 1, scope: !19, file: !1, line: 3, type: !22)
```

All the above suggests that _Atomic behaves like a type modifier (e.g., const,
restrict, volatile). This seems true based on the DWARF documentation [2].
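For instance (an illustrative snippet, not from the original post), C11 allows
_Atomic in qualifier position just like const/volatile:

```
_Atomic int a;               /* qualifier syntax (C11 6.7.3) */
_Atomic(int) b;              /* specifier syntax (C11 6.7.2.4) */
const _Atomic int *p = &a;   /* combines with other qualifiers */
```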

Without properly handling DW_TAG_atomic_type, llvm BTF generation will be
incorrect, since the current implementation assumes DW_TAG_atomic_type does
not exist. So we have two choices here:
  (1). The llvm bpf backend processes DW_TAG_atomic_type but ignores it in the
       BTF encoding.
  (2). Add another type, e.g., BTF_KIND_ATOMIC, to BTF. BTF_KIND_ATOMIC behaves
       as a modifier like const/volatile/restrict.

For choice (1), the following is a hack which can make '-g' work for test3.c:
```
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 4d847abea731..fd61bb811111 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1444,8 +1444,14 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
       DIGlobal = GVE->getVariable();
       if (SecName.starts_with(".maps"))
         visitMapDefType(DIGlobal->getType(), GVTypeId);
-      else
-        visitTypeEntry(DIGlobal->getType(), GVTypeId, false, false);
+      else {
+        const DIType *Ty = DIGlobal->getType();
+        auto *DTy = dyn_cast<DIDerivedType>(Ty);
+        if (DTy && DTy->getTag() == dwarf::DW_TAG_atomic_type)
+          visitTypeEntry(DTy->getBaseType(), GVTypeId, false, false);
+        else
+          visitTypeEntry(Ty, GVTypeId, false, false);
+      }
       break;
     }
```
You can see that basically dwarf::DW_TAG_atomic_type is skipped during BTF
generation. Further changes would be needed to handle the remaining usages of
dwarf::DW_TAG_atomic_type.

But I prefer adding BTF_KIND_ATOMIC if we indeed intend to use _Atomic in bpf
programs. This is probably the only way if the _Atomic type is used, e.g., for
a global variable, where the corresponding type in the skeleton also needs to
be _Atomic.
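For concreteness, the following is a purely hypothetical sketch (BTF_KIND_ATOMIC
and its kind value do not exist in today's UAPI) of how such a modifier could be
encoded, following the existing const/volatile/restrict convention:

```
#include <linux/btf.h>

#define BTF_KIND_ATOMIC 35  /* hypothetical kind value, not in linux/btf.h */

/* Modifier kinds (const/volatile/restrict) are anonymous btf_type entries
 * whose 'type' member points at the base type; an atomic kind would
 * presumably follow the same convention. */
static struct btf_type make_atomic(unsigned int base_type_id)
{
    struct btf_type t = {0};
    t.name_off = 0;                           /* modifiers carry no name */
    t.info = (BTF_KIND_ATOMIC & 0x1f) << 24;  /* kind is in bits 24-28 of info */
    t.type = base_type_id;                    /* the type being modified */
    return t;
}
```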

Please let me know your opinion.

 [1] https://lore.kernel.org/bpf/7b941f53-2a05-48ec-9032-8f106face3a3@linux.dev/
 [2] https://dwarfstd.org/issues/131112.1.html
---
 llvm/lib/Target/BPF/BPFInstrInfo.td   | 113 +++++++++++++++++++++++---
 llvm/lib/Target/BPF/BPFMIChecking.cpp |  91 ++++++++++++++++++---
 2 files changed, 183 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index f7e17901c7ed5e..68b0d1b70efe20 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -864,26 +864,119 @@ class XFALU32<BPFWidthModifer SizeOp, BPFArithOp Opc, string OpcodeStr,
 
 let Constraints = "$dst = $val" in {
   let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
-    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32>;
-    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32>;
-    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or",  atomic_load_or_i32>;
-    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32>;
+    def XFADDW32 : XFALU32<BPF_W, BPF_ADD, "u32", "add", atomic_load_add_i32_seq_cst>;
+    def XFANDW32 : XFALU32<BPF_W, BPF_AND, "u32", "and", atomic_load_and_i32_seq_cst>;
+    def XFORW32  : XFALU32<BPF_W, BPF_OR,  "u32", "or",  atomic_load_or_i32_seq_cst>;
+    def XFXORW32 : XFALU32<BPF_W, BPF_XOR, "u32", "xor", atomic_load_xor_i32_seq_cst>;
   }
 
   let Predicates = [BPFHasALU32] in {
-    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64>;
+    def XFADDD : XFALU64<BPF_DW, BPF_ADD, "u64", "add", atomic_load_add_i64_seq_cst>;
   }
-  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64>;
-  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or",  atomic_load_or_i64>;
-  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64>;
+  def XFANDD : XFALU64<BPF_DW, BPF_AND, "u64", "and", atomic_load_and_i64_seq_cst>;
+  def XFORD  : XFALU64<BPF_DW, BPF_OR,  "u64", "or",  atomic_load_or_i64_seq_cst>;
+  def XFXORD : XFALU64<BPF_DW, BPF_XOR, "u64", "xor", atomic_load_xor_i64_seq_cst>;
+}
+
+let Predicates = [BPFHasALU32] in {
+    def : Pat<(atomic_load_add_i32_monotonic ADDRri:$addr, GPR32:$val),
+              (XADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_acquire ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_release ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+    def : Pat<(atomic_load_add_i32_acq_rel ADDRri:$addr, GPR32:$val),
+              (XFADDW32 ADDRri:$addr, GPR32:$val)>;
+
+    def : Pat<(atomic_load_add_i64_monotonic ADDRri:$addr, GPR:$val),
+              (XADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_acquire ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_release ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
+    def : Pat<(atomic_load_add_i64_acq_rel ADDRri:$addr, GPR:$val),
+              (XFADDD ADDRri:$addr, GPR:$val)>;
 }
 
 // atomic_load_sub can be represented as a neg followed
 // by an atomic_load_add.
-def : Pat<(atomic_load_sub_i32 ADDRri:$addr, GPR32:$val),
+// FIXME: the below can probably be simplified.
+def : Pat<(atomic_load_sub_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_release ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
+def : Pat<(atomic_load_sub_i32_seq_cst ADDRri:$addr, GPR32:$val),
           (XFADDW32 ADDRri:$addr, (NEG_32 GPR32:$val))>;
-def : Pat<(atomic_load_sub_i64 ADDRri:$addr, GPR:$val),
+
+def : Pat<(atomic_load_sub_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_release ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_acq_rel ADDRri:$addr, GPR:$val),
           (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+def : Pat<(atomic_load_sub_i64_seq_cst ADDRri:$addr, GPR:$val),
+          (XFADDD ADDRri:$addr, (NEG_64 GPR:$val))>;
+
+def : Pat<(atomic_load_and_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_release ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_and_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFANDW32 ADDRri:$addr, GPR32:$val)>;
+
+
+def : Pat<(atomic_load_and_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_release ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_and_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFANDD ADDRri:$addr, GPR:$val)>;
+
+def : Pat<(atomic_load_or_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_release ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_or_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFORW32 ADDRri:$addr, GPR32:$val)>;
+
+def : Pat<(atomic_load_or_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_release ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_or_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFORD ADDRri:$addr, GPR:$val)>;
+
+def : Pat<(atomic_load_xor_i32_monotonic ADDRri:$addr, GPR32:$val),
+          (XXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_acquire ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_release ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+def : Pat<(atomic_load_xor_i32_acq_rel ADDRri:$addr, GPR32:$val),
+          (XFXORW32 ADDRri:$addr, GPR32:$val)>;
+
+def : Pat<(atomic_load_xor_i64_monotonic ADDRri:$addr, GPR:$val),
+          (XXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_acquire ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_release ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
+def : Pat<(atomic_load_xor_i64_acq_rel ADDRri:$addr, GPR:$val),
+          (XFXORD ADDRri:$addr, GPR:$val)>;
 
 // Atomic Exchange
 class XCHG<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
index 24224f6c1e9e66..6010539d21bad0 100644
--- a/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -43,14 +43,14 @@ struct BPFMIPreEmitChecking : public MachineFunctionPass {
   // Initialize class variables.
   void initialize(MachineFunction &MFParm);
 
-  void processAtomicInsts();
+  bool processAtomicInsts();
 
 public:
   // Main entry point for this pass.
   bool runOnMachineFunction(MachineFunction &MF) override {
     if (!skipFunction(MF.getFunction())) {
       initialize(MF);
-      processAtomicInsts();
+      return processAtomicInsts();
     }
     return false;
   }
@@ -152,22 +152,91 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
   return false;
 }
 
-void BPFMIPreEmitChecking::processAtomicInsts() {
+bool BPFMIPreEmitChecking::processAtomicInsts() {
+  if (!MF->getSubtarget<BPFSubtarget>().getHasJmp32()) {
+    // Only check for cpu version 1 and 2.
+    for (MachineBasicBlock &MBB : *MF) {
+      for (MachineInstr &MI : MBB) {
+        if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD)
+          continue;
+
+        LLVM_DEBUG(MI.dump());
+        if (hasLiveDefs(MI, TRI)) {
+          DebugLoc Empty;
+          const DebugLoc &DL = MI.getDebugLoc();
+          const Function &F = MF->getFunction();
+          F.getContext().diagnose(DiagnosticInfoUnsupported{
+              F, "Invalid usage of the XADD return value", DL});
+        }
+      }
+    }
+  }
+
+  // Check return values of locked atomic_{add,and,or,xor} instructions.
+  // If the return value is used, the locked instruction is replaced
+  // with the corresponding atomic_fetch_<op> instruction.
+  MachineInstr *ToErase = nullptr;
+  bool Changed = false;
+  const BPFInstrInfo *TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
   for (MachineBasicBlock &MBB : *MF) {
     for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD)
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      if (MI.getOpcode() != BPF::XADDW32 && MI.getOpcode() != BPF::XADDD &&
+          MI.getOpcode() != BPF::XANDW32 && MI.getOpcode() != BPF::XANDD &&
+          MI.getOpcode() != BPF::XXORW32 && MI.getOpcode() != BPF::XXORD &&
+          MI.getOpcode() != BPF::XORW32 && MI.getOpcode() != BPF::XORD)
         continue;
 
-      LLVM_DEBUG(MI.dump());
-      if (hasLiveDefs(MI, TRI)) {
-        DebugLoc Empty;
-        const DebugLoc &DL = MI.getDebugLoc();
-        const Function &F = MF->getFunction();
-        F.getContext().diagnose(DiagnosticInfoUnsupported{
-            F, "Invalid usage of the XADD return value", DL});
+      if (!hasLiveDefs(MI, TRI))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Transforming "; MI.dump());
+      unsigned newOpcode;
+      switch (MI.getOpcode()) {
+      case BPF::XADDW32:
+        newOpcode = BPF::XFADDW32;
+        break;
+      case BPF::XADDD:
+        newOpcode = BPF::XFADDD;
+        break;
+      case BPF::XANDW32:
+        newOpcode = BPF::XFANDW32;
+        break;
+      case BPF::XANDD:
+        newOpcode = BPF::XFANDD;
+        break;
+      case BPF::XXORW32:
+        newOpcode = BPF::XFXORW32;
+        break;
+      case BPF::XXORD:
+        newOpcode = BPF::XFXORD;
+        break;
+      case BPF::XORW32:
+        newOpcode = BPF::XFORW32;
+        break;
+      case BPF::XORD:
+        newOpcode = BPF::XFORD;
+        break;
+      default:
+        llvm_unreachable("Incorrect Atomic Instruction Opcode");
       }
+
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(newOpcode))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(1))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3));
+
+      ToErase = &MI;
+      Changed = true;
     }
   }
+
+  return Changed;
 }
 
 } // namespace


