[clang] [llvm] [X86] Use unaligned atomic load and stores (PR #79191)

Wed Jan 24 11:38:00 PST 2024

https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/79191

>From 65d41b9da086a7625e63e0a87c7fd8665fa4e154 Mon Sep 17 00:00:00 2001
From: Rose <83477269+AtariDreams at users.noreply.github.com>
Date: Tue, 23 Jan 2024 13:59:05 -0500
Subject: [PATCH] [X86] Use unaligned atomic load and stores

The backend supports it now, so we can use it.
---
 clang/lib/CodeGen/CGObjC.cpp                  |  5 +-
 clang/test/CodeGenObjC/objc_copyStruct.m      |  3 +-
 clang/test/CodeGenObjC/property-aggregate.m   | 15 +--
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  1 +
 llvm/test/CodeGen/X86/unaligned-atomic-ops.ll | 92 +++++++++++++++++++
 5 files changed, 99 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/unaligned-atomic-ops.ll

diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp
index 03fc0ec7ff54e1c..debfc84f49e4848 100644
--- a/clang/lib/CodeGen/CGObjC.cpp
+++ b/clang/lib/CodeGen/CGObjC.cpp
@@ -846,8 +846,9 @@ static void emitStructGetterCall(CodeGenFunction &CGF, ObjCIvarDecl *ivar,
 /// accesses.  They don't have to be fast, just faster than a function
 /// call and a mutex.
 static bool hasUnalignedAtomics(llvm::Triple::ArchType arch) {
-  // FIXME: Allow unaligned atomic load/store on x86.  (It is not
-  // currently supported by the backend.)
+  // x86 is the only one so far that we know support this as of now
+  if (arch == llvm::Triple::x86 || arch == llvm::Triple::x86_64)
+    return true;
   return false;
 }
 
diff --git a/clang/test/CodeGenObjC/objc_copyStruct.m b/clang/test/CodeGenObjC/objc_copyStruct.m
index 7bbad866e2b1fb2..8e52815a308abcb 100644
--- a/clang/test/CodeGenObjC/objc_copyStruct.m
+++ b/clang/test/CodeGenObjC/objc_copyStruct.m
@@ -2,7 +2,7 @@
 // RUN: %clang -target x86_64-apple-ios -fobjc-runtime=ios -Wno-objc-root-class -S -o - -emit-llvm %s | FileCheck %s
 
 struct S {
-  float f, g;
+  double f, g;
 };
 
 @interface I
@@ -13,4 +13,3 @@ @implementation I
 @end
 
 // CHECK: declare {{.*}}void @objc_copyStruct(ptr, ptr, i64, i1, i1)
-
diff --git a/clang/test/CodeGenObjC/property-aggregate.m b/clang/test/CodeGenObjC/property-aggregate.m
index f4211b6b62bd503..4c8c8893f920f4a 100644
--- a/clang/test/CodeGenObjC/property-aggregate.m
+++ b/clang/test/CodeGenObjC/property-aggregate.m
@@ -1,13 +1,8 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - | FileCheck %s
 
-// This structure's size is not a power of two, so the property does
-// not get native atomics, even though x86-64 can do unaligned atomics
-// with a lock prefix.
 struct s3 { char c[3]; };
 
 // This structure's size is, so it does, because it can.
-// FIXME: But we don't at the moment; the backend doesn't know how to generate
-// correct code.
 struct s4 { char c[4]; };
 
 @interface Test0
@@ -18,14 +13,8 @@ @implementation Test0
 @synthesize s3, s4;
 @end
 
-// CHECK: define internal i24 @"\01-[Test0 s3]"(
-// CHECK: call void @objc_copyStruct
 
-// CHECK: define internal void @"\01-[Test0 setS3:]"(
-// CHECK: call void @objc_copyStruct
 
-// CHECK: define internal i32 @"\01-[Test0 s4]"(
-// CHECK: call void @objc_copyStruct
 
-// CHECK: define internal void @"\01-[Test0 setS4:]"(
-// CHECK: call void @objc_copyStruct
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5cc2803b2808797..eacaf6bbd2296ae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -107,6 +107,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setSchedulingPreference(Sched::RegPressure);
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+  setSupportsUnalignedAtomics(true);
 
   // Bypass expensive divides and use cheaper ones.
   if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
diff --git a/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll b/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll
new file mode 100644
index 000000000000000..9e5173ff2b37e61
--- /dev/null
+++ b/llvm/test/CodeGen/X86/unaligned-atomic-ops.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i386-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic    | FileCheck -check-prefix=I386 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic  | FileCheck -check-prefix=COREI7 %s
+
+; This verifies that the middle end can handle an unaligned atomic load.
+;
+; In the past, an assertion inside the SelectionDAGBuilder would always
+; hit an assertion for unaligned loads and stores.
+
+%AtomicI16 = type { %CellI16, [0 x i8] }
+%CellI16 = type { i16, [0 x i8] }
+
+; CHECK-LABEL: foo
+; CHECK: ret
+define void @foo(%AtomicI16* %self) {
+; I386-LABEL: foo:
+; I386:       ## %bb.0: ## %start
+; I386-NEXT:    pushl %esi
+; I386-NEXT:    .cfi_def_cfa_offset 8
+; I386-NEXT:    subl $24, %esp
+; I386-NEXT:    .cfi_def_cfa_offset 32
+; I386-NEXT:    .cfi_offset %esi, -8
+; I386-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $2, (%esp)
+; I386-NEXT:    calll ___atomic_load
+; I386-NEXT:    movw $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $5, {{[0-9]+}}(%esp)
+; I386-NEXT:    movl $2, (%esp)
+; I386-NEXT:    calll ___atomic_store
+; I386-NEXT:    addl $24, %esp
+; I386-NEXT:    popl %esi
+; I386-NEXT:    retl
+;
+; CORE2-LABEL: foo:
+; CORE2:       ## %bb.0: ## %start
+; CORE2-NEXT:    pushq %rbx
+; CORE2-NEXT:    .cfi_def_cfa_offset 16
+; CORE2-NEXT:    subq $16, %rsp
+; CORE2-NEXT:    .cfi_def_cfa_offset 32
+; CORE2-NEXT:    .cfi_offset %rbx, -16
+; CORE2-NEXT:    movq %rdi, %rbx
+; CORE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CORE2-NEXT:    movl $2, %edi
+; CORE2-NEXT:    movq %rbx, %rsi
+; CORE2-NEXT:    movl $5, %ecx
+; CORE2-NEXT:    callq ___atomic_load
+; CORE2-NEXT:    movw $5, {{[0-9]+}}(%rsp)
+; CORE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CORE2-NEXT:    movl $2, %edi
+; CORE2-NEXT:    movq %rbx, %rsi
+; CORE2-NEXT:    movl $5, %ecx
+; CORE2-NEXT:    callq ___atomic_store
+; CORE2-NEXT:    addq $16, %rsp
+; CORE2-NEXT:    popq %rbx
+; CORE2-NEXT:    retq
+;
+; COREI7-LABEL: foo:
+; COREI7:       ## %bb.0: ## %start
+; COREI7-NEXT:    pushq %rbx
+; COREI7-NEXT:    .cfi_def_cfa_offset 16
+; COREI7-NEXT:    subq $16, %rsp
+; COREI7-NEXT:    .cfi_def_cfa_offset 32
+; COREI7-NEXT:    .cfi_offset %rbx, -16
+; COREI7-NEXT:    movq %rdi, %rbx
+; COREI7-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; COREI7-NEXT:    movl $2, %edi
+; COREI7-NEXT:    movq %rbx, %rsi
+; COREI7-NEXT:    movl $5, %ecx
+; COREI7-NEXT:    callq ___atomic_load
+; COREI7-NEXT:    movw $5, {{[0-9]+}}(%rsp)
+; COREI7-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; COREI7-NEXT:    movl $2, %edi
+; COREI7-NEXT:    movq %rbx, %rsi
+; COREI7-NEXT:    movl $5, %ecx
+; COREI7-NEXT:    callq ___atomic_store
+; COREI7-NEXT:    addq $16, %rsp
+; COREI7-NEXT:    popq %rbx
+; COREI7-NEXT:    retq
+start:
+  %a = getelementptr inbounds %AtomicI16, %AtomicI16* %self, i16 0, i32 0, i32 0
+  load atomic i16, i16* %a seq_cst, align 1
+  store atomic i16 5, i16* %a seq_cst, align 1
+  ret void
+}