[clang] [llvm] Use unaligned atomic load and stores on x86 (PR #79191)

via cfe-commits cfe-commits at lists.llvm.org
Tue Jan 23 12:02:29 PST 2024


https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/79191

>From 9d8ca53e5439f838eea7c8d8531cac6c27df2c47 Mon Sep 17 00:00:00 2001
From: Rose <83477269+AtariDreams at users.noreply.github.com>
Date: Tue, 23 Jan 2024 13:59:05 -0500
Subject: [PATCH] Use unaligned atomic load and stores on x86

The backend supports it now, so we can use it.
---
 clang/lib/CodeGen/CGObjC.cpp                  |  5 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  1 +
 llvm/test/CodeGen/X86/unaligned-atomic-ops.ll | 92 +++++++++++++++++++
 3 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/unaligned-atomic-ops.ll

diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 03fc0ec7ff54e1c..debfc84f49e4848 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -846,8 +846,9 @@ static void emitStructGetterCall(CodeGenFunction &CGF, ObjCIvarDecl *ivar,
 /// accesses.  They don't have to be fast, just faster than a function
 /// call and a mutex.
 static bool hasUnalignedAtomics(llvm::Triple::ArchType arch) {
-  // FIXME: Allow unaligned atomic load/store on x86.  (It is not
-  // currently supported by the backend.)
+  // x86 is currently the only architecture known to support this.
+  if (arch == llvm::Triple::x86 || arch == llvm::Triple::x86_64)
+    return true;
   return false;
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e158312caffdec7..9b5128cc1361147 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -107,6 +107,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setSchedulingPreference(Sched::RegPressure);
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+  setSupportsUnalignedAtomics(true);
 
   // Bypass expensive divides and use cheaper ones.
   if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
diff --git a/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll b/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll
new file mode 100644
index 000000000000000..9e5173ff2b37e61
--- /dev/null
+++ b/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic    | FileCheck -check-prefix=I386 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=COREI7 %s
+
+; This verifies that the backend can handle an unaligned atomic load and store.
+;
+; In the past, SelectionDAGBuilder would always hit an assertion
+; for unaligned atomic loads and stores.
+
+%AtomicI16 = type { %CellI16, [0 x i8] }
+%CellI16 = type { i16, [0 x i8] }
+
+; CHECK-LABEL: foo
+; CHECK: ret
+define void @foo(%AtomicI16* %self) {
+; I386-LABEL: foo:
+; I386:       ## %bb.0: ## %start
+; I386-NEXT:    pushl %esi
+; I386-NEXT:    .cfi_def_cfa_offset 8
+; I386-NEXT:    subl $24, %esp
+; I386-NEXT:    .cfi_def_cfa_offset 32
+; I386-NEXT:    .cfi_offset %esi, -8
+; I386-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $2, (%esp)
+; I386-NEXT:    calll ___atomic_load
+; I386-NEXT:    movw $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $2, (%esp)
+; I386-NEXT:    calll ___atomic_store
+; I386-NEXT:    addl $24, %esp
+; I386-NEXT:    popl %esi
+; I386-NEXT:    retl
+;
+; CORE2-LABEL: foo:
+; CORE2:       ## %bb.0: ## %start
+; CORE2-NEXT:    pushq %rbx
+; CORE2-NEXT:    .cfi_def_cfa_offset 16
+; CORE2-NEXT:    subq $16, %rsp
+; CORE2-NEXT:    .cfi_def_cfa_offset 32
+; CORE2-NEXT:    .cfi_offset %rbx, -16
+; CORE2-NEXT:    movq %rdi, %rbx
+; CORE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CORE2-NEXT:    movl $2, %edi
+; CORE2-NEXT:    movq %rbx, %rsi
+; CORE2-NEXT:    movl $5, %ecx
+; CORE2-NEXT:    callq ___atomic_load
+; CORE2-NEXT:    movw $5, {{[0-9]+}}(%rsp)
+; CORE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CORE2-NEXT:    movl $2, %edi
+; CORE2-NEXT:    movq %rbx, %rsi
+; CORE2-NEXT:    movl $5, %ecx
+; CORE2-NEXT:    callq ___atomic_store
+; CORE2-NEXT:    addq $16, %rsp
+; CORE2-NEXT:    popq %rbx
+; CORE2-NEXT:    retq
+;
+; COREI7-LABEL: foo:
+; COREI7:       ## %bb.0: ## %start
+; COREI7-NEXT:    pushq %rbx
+; COREI7-NEXT:    .cfi_def_cfa_offset 16
+; COREI7-NEXT:    subq $16, %rsp
+; COREI7-NEXT:    .cfi_def_cfa_offset 32
+; COREI7-NEXT:    .cfi_offset %rbx, -16
+; COREI7-NEXT:    movq %rdi, %rbx
+; COREI7-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; COREI7-NEXT:    movl $2, %edi
+; COREI7-NEXT:    movq %rbx, %rsi
+; COREI7-NEXT:    movl $5, %ecx
+; COREI7-NEXT:    callq ___atomic_load
+; COREI7-NEXT:    movw $5, {{[0-9]+}}(%rsp)
+; COREI7-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; COREI7-NEXT:    movl $2, %edi
+; COREI7-NEXT:    movq %rbx, %rsi
+; COREI7-NEXT:    movl $5, %ecx
+; COREI7-NEXT:    callq ___atomic_store
+; COREI7-NEXT:    addq $16, %rsp
+; COREI7-NEXT:    popq %rbx
+; COREI7-NEXT:    retq
+start:
+  %a = getelementptr inbounds %AtomicI16, %AtomicI16* %self, i16 0, i32 0, i32 0
+  load atomic i16, i16* %a seq_cst, align 1
+  store atomic i16 5, i16* %a seq_cst, align 1
+  ret void
+}



More information about the cfe-commits mailing list