[llvm] [X86] Manage atomic load of fp -> int promotion in DAG (PR #118793)

via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 17 14:00:33 PST 2024


https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/118793

>From 9b0a33b3f4e9d7da6e755e36722e72396fb9f064 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Mon, 25 Nov 2024 15:05:53 -0500
Subject: [PATCH 01/10] [Verifier] Allow vector type in atomic load and store

Vector types on atomics are assumed to be invalid by the verifier. However,
these types can be valid if they are lowered by codegen.
---
 llvm/docs/LangRef.rst         |  8 ++++----
 llvm/lib/IR/Verifier.cpp      | 14 ++++++++------
 llvm/test/Verifier/atomics.ll | 15 ++++++++-------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 79bdd25c18f1fd..32ba5ebdec6d37 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -10956,8 +10956,8 @@ If the ``load`` is marked as ``atomic``, it takes an extra :ref:`ordering
 <ordering>` and optional ``syncscope("<target-scope>")`` argument. The
 ``release`` and ``acq_rel`` orderings are not valid on ``load`` instructions.
 Atomic loads produce :ref:`defined <memmodel>` results when they may see
-multiple atomic stores. The type of the pointee must be an integer, pointer, or
-floating-point type whose bit width is a power of two greater than or equal to
+multiple atomic stores. The type of the pointee must be an integer, pointer,
+floating-point, or vector type whose bit width is a power of two greater than or equal to
 eight and less than or equal to a target-specific size limit.  ``align`` must be
 explicitly specified on atomic loads. Note: if the alignment is not greater or
 equal to the size of the `<value>` type, the atomic operation is likely to
@@ -11097,8 +11097,8 @@ If the ``store`` is marked as ``atomic``, it takes an extra :ref:`ordering
 <ordering>` and optional ``syncscope("<target-scope>")`` argument. The
 ``acquire`` and ``acq_rel`` orderings aren't valid on ``store`` instructions.
 Atomic loads produce :ref:`defined <memmodel>` results when they may see
-multiple atomic stores. The type of the pointee must be an integer, pointer, or
-floating-point type whose bit width is a power of two greater than or equal to
+multiple atomic stores. The type of the pointee must be an integer, pointer,
+floating-point, or vector type whose bit width is a power of two greater than or equal to
 eight and less than or equal to a target-specific size limit.  ``align`` must be
 explicitly specified on atomic stores. Note: if the alignment is not greater or
 equal to the size of the `<value>` type, the atomic operation is likely to
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 55de486e90e190..6f847e3b3fc70c 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4255,9 +4255,10 @@ void Verifier::visitLoadInst(LoadInst &LI) {
     Check(LI.getOrdering() != AtomicOrdering::Release &&
               LI.getOrdering() != AtomicOrdering::AcquireRelease,
           "Load cannot have Release ordering", &LI);
-    Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
-          "atomic load operand must have integer, pointer, or floating point "
-          "type!",
+    Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+              ElTy->getScalarType()->isFloatingPointTy(),
+          "atomic load operand must have integer, pointer, floating point, "
+          "or vector type!",
           ElTy, &LI);
     checkAtomicMemAccessSize(ElTy, &LI);
   } else {
@@ -4281,9 +4282,10 @@ void Verifier::visitStoreInst(StoreInst &SI) {
     Check(SI.getOrdering() != AtomicOrdering::Acquire &&
               SI.getOrdering() != AtomicOrdering::AcquireRelease,
           "Store cannot have Acquire ordering", &SI);
-    Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
-          "atomic store operand must have integer, pointer, or floating point "
-          "type!",
+    Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+              ElTy->getScalarType()->isFloatingPointTy(),
+          "atomic store operand must have integer, pointer, floating point, "
+          "or vector type!",
           ElTy, &SI);
     checkAtomicMemAccessSize(ElTy, &SI);
   } else {
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index f835b98b243456..17bf5a0528d738 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -1,14 +1,15 @@
 ; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+; CHECK: atomic store operand must have integer, pointer, floating point, or vector type!
+; CHECK: atomic load operand must have integer, pointer, floating point, or vector type!
 
-; CHECK: atomic store operand must have integer, pointer, or floating point type!
-; CHECK: atomic load operand must have integer, pointer, or floating point type!
+%ty = type { i32 };
 
-define void @foo(ptr %P, <1 x i64> %v) {
-  store atomic <1 x i64> %v, ptr %P unordered, align 8
+define void @foo(ptr %P, %ty %v) {
+  store atomic %ty %v, ptr %P unordered, align 8
   ret void
 }
 
-define <1 x i64> @bar(ptr %P) {
-  %v = load atomic <1 x i64>, ptr %P unordered, align 8
-  ret <1 x i64> %v
+define %ty @bar(ptr %P) {
+  %v = load atomic %ty, ptr %P unordered, align 8
+  ret %ty %v
 }

>From aba2a030efe870593a3fc3d8ccf6eace7c70522c Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Wed, 4 Dec 2024 13:30:12 -0500
Subject: [PATCH 02/10] Update Assembler/atomic test

---
 llvm/test/Assembler/atomic.ll | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index a44dcccc16bef1..f1027d5d3fbde4 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -52,6 +52,15 @@ define void @f(ptr %x) {
   ; CHECK: atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
   atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
 
+  ; CHECK: load atomic <1 x i32>, ptr %x unordered, align 4
+  load atomic <1 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <1 x i32> splat (i32 3), ptr %x release, align 4
+  store atomic <1 x i32> <i32 3>, ptr %x release, align 4
+  ; CHECK: load atomic <2 x i32>, ptr %x unordered, align 4
+  load atomic <2 x i32>, ptr %x unordered, align 4
+  ; CHECK: store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+  store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+
   ; CHECK: fence syncscope("singlethread") release
   fence syncscope("singlethread") release
   ; CHECK: fence seq_cst

>From 10b57a1b1ee586ac492657fd07713ee5b75e1908 Mon Sep 17 00:00:00 2001
From: jofernau <Joe.Fernau at amd.com>
Date: Mon, 7 Oct 2024 13:47:50 -0400
Subject: [PATCH 03/10] [SelectionDAG] Legalize vector types for atomic load

Scalarize vector of atomic load in SelectionDAG.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h   |  1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp        | 16 ++++++++++++++++
 .../CodeGen/Generic/atomic-scalarization.ll     | 17 +++++++++++++++++
 3 files changed, 34 insertions(+)
 create mode 100644 llvm/test/CodeGen/Generic/atomic-scalarization.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 1703149aca7463..0086405825cd5c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -860,6 +860,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue ScalarizeVecRes_ExpOp(SDNode *N);
   SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+  SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N);
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue ScalarizeVecRes_VSELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 465128099f4447..eb3841f48c0e5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -60,6 +60,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_ROUND:          R = ScalarizeVecRes_FP_ROUND(N); break;
   case ISD::FPOWI:             R = ScalarizeVecRes_ExpOp(N); break;
   case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::ATOMIC_LOAD:
+    R = ScalarizeVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
+    break;
   case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
   case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
   case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
@@ -451,6 +454,19 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
   return Op;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) {
+
+  SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD, SDLoc(N),
+      N->getMemoryVT().getVectorElementType(),
+      N->getValueType(0).getVectorElementType(),
+      N->getChain(), N->getBasePtr(), N->getMemOperand());
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+  return Result;
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
   assert(N->isUnindexed() && "Indexed vector load?");
 
diff --git a/llvm/test/CodeGen/Generic/atomic-scalarization.ll b/llvm/test/CodeGen/Generic/atomic-scalarization.ll
new file mode 100644
index 00000000000000..3f611323172884
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/atomic-scalarization.ll
@@ -0,0 +1,17 @@
+; RUN: llc %s --print-after-isel --disable-verify 2>&1 | FileCheck %s
+
+define i32 @atomic_scalar() {
+; CHECK: # After Instruction Selection:
+; CHECK-NEXT: # Machine code for function atomic_scalar: IsSSA, TracksLiveness
+; CHECK-NEXT: Frame Objects:
+; CHECK-NEXT:   fi#0: size=4, align=4, at location [SP+8]
+; CHECK:      bb.0 (%ir-block.0):
+; CHECK-NEXT:   %0:gr32 = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (dereferenceable load acquire (s32) from %ir.1)
+; CHECK-NEXT:   $eax = COPY %0:gr32
+; CHECK-NEXT:   RET 0, $eax
+; CHECK:      # End machine code for function atomic_scalar.
+  %1 = alloca <1 x i32>
+  %2 = load atomic <1 x i32>, ptr %1 acquire, align 4
+  %3 = extractelement <1 x i32> %2, i32 0
+  ret i32 %3
+}

>From a95ff19bde0aea2ca084fbe8398c9099c45d4b1b Mon Sep 17 00:00:00 2001
From: jofernau <Joe.Fernau at amd.com>
Date: Mon, 7 Oct 2024 14:18:52 -0400
Subject: [PATCH 04/10] Moved test and checking mir after last X86 pass

---
 .../CodeGen/Generic/atomic-scalarization.ll     | 17 -----------------
 llvm/test/CodeGen/X86/atomic-scalarization.ll   | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 17 deletions(-)
 delete mode 100644 llvm/test/CodeGen/Generic/atomic-scalarization.ll
 create mode 100644 llvm/test/CodeGen/X86/atomic-scalarization.ll

diff --git a/llvm/test/CodeGen/Generic/atomic-scalarization.ll b/llvm/test/CodeGen/Generic/atomic-scalarization.ll
deleted file mode 100644
index 3f611323172884..00000000000000
--- a/llvm/test/CodeGen/Generic/atomic-scalarization.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc %s --print-after-isel --disable-verify 2>&1 | FileCheck %s
-
-define i32 @atomic_scalar() {
-; CHECK: # After Instruction Selection:
-; CHECK-NEXT: # Machine code for function atomic_scalar: IsSSA, TracksLiveness
-; CHECK-NEXT: Frame Objects:
-; CHECK-NEXT:   fi#0: size=4, align=4, at location [SP+8]
-; CHECK:      bb.0 (%ir-block.0):
-; CHECK-NEXT:   %0:gr32 = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (dereferenceable load acquire (s32) from %ir.1)
-; CHECK-NEXT:   $eax = COPY %0:gr32
-; CHECK-NEXT:   RET 0, $eax
-; CHECK:      # End machine code for function atomic_scalar.
-  %1 = alloca <1 x i32>
-  %2 = load atomic <1 x i32>, ptr %1 acquire, align 4
-  %3 = extractelement <1 x i32> %2, i32 0
-  ret i32 %3
-}
diff --git a/llvm/test/CodeGen/X86/atomic-scalarization.ll b/llvm/test/CodeGen/X86/atomic-scalarization.ll
new file mode 100644
index 00000000000000..22d240bd855180
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic-scalarization.ll
@@ -0,0 +1,16 @@
+; RUN: llc %s --mtriple=x86_64 -print-after=unpack-mi-bundles -disable-verify 2>&1 | FileCheck %s
+
+define i32 @atomic_scalar() {
+; CHECK: # *** IR Dump After Unpack machine instruction bundles (unpack-mi-bundles) ***:
+; CHECK-NEXT: # Machine code for function atomic_scalar: NoPHIs, TracksLiveness, NoVRegs, TiedOpsRewritten, TracksDebugUserValues
+; CHECK-NEXT: Frame Objects:
+; CHECK-NEXT:   fi#0: size=4, align=4, at location [SP-4]
+; CHECK:      bb.0 (%ir-block.0):
+; CHECK-NEXT:   renamable $eax = MOV32rm $rsp, 1, $noreg, -4, $noreg :: (dereferenceable load acquire (s32) from %ir.1)
+; CHECK-NEXT:   RET64 $eax
+; CHECK:      # End machine code for function atomic_scalar.
+  %1 = alloca <1 x i32>
+  %2 = load atomic <1 x i32>, ptr %1 acquire, align 4
+  %3 = extractelement <1 x i32> %2, i32 0
+  ret i32 %3
+}

>From d9d4a1c3a89d6171be481cc6f03db5e7de56bc28 Mon Sep 17 00:00:00 2001
From: jofernau <Joe.Fernau at amd.com>
Date: Mon, 7 Oct 2024 14:44:00 -0400
Subject: [PATCH 05/10] Autogenerate test.

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  8 ++---
 llvm/test/CodeGen/X86/atomic-scalarization.ll | 33 +++++++++++--------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index eb3841f48c0e5a..bdd71b251f3941 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -456,10 +456,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) {
 
-  SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD, SDLoc(N),
-      N->getMemoryVT().getVectorElementType(),
-      N->getValueType(0).getVectorElementType(),
-      N->getChain(), N->getBasePtr(), N->getMemOperand());
+  SDValue Result = DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(N), N->getMemoryVT().getVectorElementType(),
+      N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(),
+      N->getMemOperand());
 
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
diff --git a/llvm/test/CodeGen/X86/atomic-scalarization.ll b/llvm/test/CodeGen/X86/atomic-scalarization.ll
index 22d240bd855180..6560924ed99546 100644
--- a/llvm/test/CodeGen/X86/atomic-scalarization.ll
+++ b/llvm/test/CodeGen/X86/atomic-scalarization.ll
@@ -1,16 +1,21 @@
-; RUN: llc %s --mtriple=x86_64 -print-after=unpack-mi-bundles -disable-verify 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s
 
-define i32 @atomic_scalar() {
-; CHECK: # *** IR Dump After Unpack machine instruction bundles (unpack-mi-bundles) ***:
-; CHECK-NEXT: # Machine code for function atomic_scalar: NoPHIs, TracksLiveness, NoVRegs, TiedOpsRewritten, TracksDebugUserValues
-; CHECK-NEXT: Frame Objects:
-; CHECK-NEXT:   fi#0: size=4, align=4, at location [SP-4]
-; CHECK:      bb.0 (%ir-block.0):
-; CHECK-NEXT:   renamable $eax = MOV32rm $rsp, 1, $noreg, -4, $noreg :: (dereferenceable load acquire (s32) from %ir.1)
-; CHECK-NEXT:   RET64 $eax
-; CHECK:      # End machine code for function atomic_scalar.
-  %1 = alloca <1 x i32>
-  %2 = load atomic <1 x i32>, ptr %1 acquire, align 4
-  %3 = extractelement <1 x i32> %2, i32 0
-  ret i32 %3
+define <1 x i32> @atomic_scalar_i32(ptr %x) {
+; CHECK-LABEL: atomic_scalar_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+  ret <1 x i32> %ret
+}
+
+define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
+; CHECK-LABEL: atomic_scalar_bfloat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
+  ret <1 x bfloat> %ret
 }

>From 78ec95bcdc326e2da9c49f0f0b7004febadd2a14 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Fri, 6 Dec 2024 16:55:25 -0500
Subject: [PATCH 06/10] Renamed to atomic-vector

---
 .../X86/{atomic-scalarization.ll => atomic-vector.ll}     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
 rename llvm/test/CodeGen/X86/{atomic-scalarization.ll => atomic-vector.ll} (75%)

diff --git a/llvm/test/CodeGen/X86/atomic-scalarization.ll b/llvm/test/CodeGen/X86/atomic-vector.ll
similarity index 75%
rename from llvm/test/CodeGen/X86/atomic-scalarization.ll
rename to llvm/test/CodeGen/X86/atomic-vector.ll
index 6560924ed99546..b676b96f4507cf 100644
--- a/llvm/test/CodeGen/X86/atomic-scalarization.ll
+++ b/llvm/test/CodeGen/X86/atomic-vector.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s
 
-define <1 x i32> @atomic_scalar_i32(ptr %x) {
-; CHECK-LABEL: atomic_scalar_i32:
+define <1 x i32> @atomic_vector_i32(ptr %x) {
+; CHECK-LABEL: atomic_vector_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
@@ -10,8 +10,8 @@ define <1 x i32> @atomic_scalar_i32(ptr %x) {
   ret <1 x i32> %ret
 }
 
-define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
-; CHECK-LABEL: atomic_scalar_bfloat:
+define <1 x bfloat> @atomic_vector_bfloat(ptr %x) {
+; CHECK-LABEL: atomic_vector_bfloat:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl (%rdi), %eax
 ; CHECK-NEXT:    pinsrw $0, %eax, %xmm0

>From 009b140c415c3f4bdafe8e123a7021d7297f5fe1 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Fri, 6 Dec 2024 17:02:12 -0500
Subject: [PATCH 07/10] Add to preexisting test

---
 llvm/test/CodeGen/X86/atomic-load-store.ll | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 5bce4401f7bdb0..2ab97c9660aa25 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s
+; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s --check-prefix=CHECK0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    xchgl %esi, (%rdi)
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test1:
+; CHECK0:       # %bb.0:
+; CHECK0-NEXT:    xchgl %esi, (%rdi)
+; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr seq_cst, align 4
   ret void
 }
@@ -16,6 +22,11 @@ define void @test2(ptr %ptr, i32 %val1) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, (%rdi)
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test2:
+; CHECK0:       # %bb.0:
+; CHECK0-NEXT:    movl %esi, (%rdi)
+; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr release, align 4
   ret void
 }
@@ -25,6 +36,35 @@ define i32 @test3(ptr %ptr) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test3:
+; CHECK0:       # %bb.0:
+; CHECK0-NEXT:    movl (%rdi), %eax
+; CHECK0-NEXT:    retq
   %val = load atomic i32, ptr %ptr seq_cst, align 4
   ret i32 %val
 }
+
+define <1 x i32> @atomic_scalar_i32(ptr %x) {
+; CHECK-LABEL: atomic_scalar_i32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_scalar_i32:
+; CHECK0:       # %bb.0:
+; CHECK0-NEXT:    movl (%rdi), %eax
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+  ret <1 x i32> %ret
+}
+
+define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
+; CHECK0-LABEL: atomic_scalar_bfloat:
+; CHECK0:       # %bb.0:
+; CHECK0-NEXT:    movzwl (%rdi), %eax
+; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
+  ret <1 x bfloat> %ret
+}

>From f99ae713d0da88f5ca3281720ac992eb488ca692 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Mon, 9 Dec 2024 10:02:36 -0500
Subject: [PATCH 08/10] Rename tests

---
 llvm/test/CodeGen/X86/atomic-load-store.ll | 35 ++--------------------
 llvm/test/CodeGen/X86/atomic-vector.ll     | 21 -------------
 2 files changed, 2 insertions(+), 54 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/atomic-vector.ll

diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 2ab97c9660aa25..9cac8167542d8b 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,18 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s
-; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s --check-prefix=CHECK0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    xchgl %esi, (%rdi)
 ; CHECK-NEXT:    retq
-;
-; CHECK0-LABEL: test1:
-; CHECK0:       # %bb.0:
-; CHECK0-NEXT:    xchgl %esi, (%rdi)
-; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr seq_cst, align 4
   ret void
 }
@@ -22,11 +16,6 @@ define void @test2(ptr %ptr, i32 %val1) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, (%rdi)
 ; CHECK-NEXT:    retq
-;
-; CHECK0-LABEL: test2:
-; CHECK0:       # %bb.0:
-; CHECK0-NEXT:    movl %esi, (%rdi)
-; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr release, align 4
   ret void
 }
@@ -36,35 +25,15 @@ define i32 @test3(ptr %ptr) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
-;
-; CHECK0-LABEL: test3:
-; CHECK0:       # %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    retq
   %val = load atomic i32, ptr %ptr seq_cst, align 4
   ret i32 %val
 }
 
-define <1 x i32> @atomic_scalar_i32(ptr %x) {
-; CHECK-LABEL: atomic_scalar_i32:
+define <1 x i32> @atomic_vec1_i32(ptr %x) {
+; CHECK-LABEL: atomic_vec1_i32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_scalar_i32:
-; CHECK0:       # %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    retq
   %ret = load atomic <1 x i32>, ptr %x acquire, align 4
   ret <1 x i32> %ret
 }
-
-define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
-; CHECK0-LABEL: atomic_scalar_bfloat:
-; CHECK0:       # %bb.0:
-; CHECK0-NEXT:    movzwl (%rdi), %eax
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    retq
-  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
-  ret <1 x bfloat> %ret
-}
diff --git a/llvm/test/CodeGen/X86/atomic-vector.ll b/llvm/test/CodeGen/X86/atomic-vector.ll
deleted file mode 100644
index b676b96f4507cf..00000000000000
--- a/llvm/test/CodeGen/X86/atomic-vector.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s
-
-define <1 x i32> @atomic_vector_i32(ptr %x) {
-; CHECK-LABEL: atomic_vector_i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl (%rdi), %eax
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
-  ret <1 x i32> %ret
-}
-
-define <1 x bfloat> @atomic_vector_bfloat(ptr %x) {
-; CHECK-LABEL: atomic_vector_bfloat:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl (%rdi), %eax
-; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
-  ret <1 x bfloat> %ret
-}

>From eae47043fc3aaf89c69bb352ee6e430d0bd44f84 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Thu, 5 Dec 2024 05:09:48 -0500
Subject: [PATCH 09/10] [SelectionDAG][X86] Add floating point promotion.

When lowering atomic vector types with floats, selection can fail since
this pattern is unsupported. To support this, floats can be cast to
an integer type of the same size.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  4 ++
 llvm/test/CodeGen/X86/atomic-scalarization.ll | 40 +++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/atomic-scalarization.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9048d1d83f1874..7f4021f4beedcc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2589,6 +2589,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
         setOperationAction(Op, MVT::f32, Promote);
   }
 
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
+  setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
+
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
                        ISD::SCALAR_TO_VECTOR,
diff --git a/llvm/test/CodeGen/X86/atomic-scalarization.ll b/llvm/test/CodeGen/X86/atomic-scalarization.ll
new file mode 100644
index 00000000000000..dfffb894a87ddb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic-scalarization.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s
+
+define <1 x i32> @atomic_scalar_i32(ptr %x) {
+; CHECK-LABEL: atomic_scalar_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+  ret <1 x i32> %ret
+}
+
+define <1 x float> @atomic_scalar_float(ptr %x) {
+; CHECK-LABEL: atomic_scalar_float:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x float>, ptr %x acquire, align 4
+  ret <1 x float> %ret
+}
+
+define <1 x half> @atomic_scalar_half(ptr %x) {
+; CHECK-LABEL: atomic_scalar_half:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x half>, ptr %x acquire, align 4
+  ret <1 x half> %ret
+}
+
+define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
+; CHECK-LABEL: atomic_scalar_bfloat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
+  ret <1 x bfloat> %ret
+}

>From f04b405ff98f45f5edc0d710c32c9aee5a4456bf Mon Sep 17 00:00:00 2001
From: jofrn <jofernau at amd.com>
Date: Tue, 17 Dec 2024 16:59:45 -0500
Subject: [PATCH 10/10] Move vec1_{i32,float,half,bfloat} tests.

---
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 75 ++++++++++++++++++-
 llvm/test/CodeGen/X86/atomic-scalarization.ll | 40 ----------
 2 files changed, 74 insertions(+), 41 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/atomic-scalarization.ll

diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 9cac8167542d8b..2bde0d2ffd06ad 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s --check-prefix=CHECK0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    xchgl %esi, (%rdi)
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test1:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    xchgl %esi, (%rdi)
+; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr seq_cst, align 4
   ret void
 }
@@ -16,6 +21,11 @@ define void @test2(ptr %ptr, i32 %val1) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, (%rdi)
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test2:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movl %esi, (%rdi)
+; CHECK0-NEXT:    retq
   store atomic i32 %val1, ptr %ptr release, align 4
   ret void
 }
@@ -25,6 +35,11 @@ define i32 @test3(ptr %ptr) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: test3:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movl (%rdi), %eax
+; CHECK0-NEXT:    retq
   %val = load atomic i32, ptr %ptr seq_cst, align 4
   ret i32 %val
 }
@@ -34,6 +49,64 @@ define <1 x i32> @atomic_vec1_i32(ptr %x) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_i32:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movl (%rdi), %eax
+; CHECK0-NEXT:    retq
   %ret = load atomic <1 x i32>, ptr %x acquire, align 4
   ret <1 x i32> %ret
 }
+
+define <1 x half> @atomic_vec1_half(ptr %x) {
+; CHECK-LABEL: atomic_vec1_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_half:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %cx
+; CHECK0-NEXT:    ## implicit-def: $eax
+; CHECK0-NEXT:    movw %cx, %ax
+; CHECK0-NEXT:    ## implicit-def: $xmm0
+; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x half>, ptr %x acquire, align 4
+  ret <1 x half> %ret
+}
+
+define <1 x float> @atomic_vec1_float(ptr %x) {
+; CHECK-LABEL: atomic_vec1_float:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_float:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x float>, ptr %x acquire, align 4
+  ret <1 x float> %ret
+}
+
+define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
+; CHECK-LABEL: atomic_vec1_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_bfloat:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %cx
+; CHECK0-NEXT:    ## implicit-def: $eax
+; CHECK0-NEXT:    movw %cx, %ax
+; CHECK0-NEXT:    ## implicit-def: $xmm0
+; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
+  ret <1 x bfloat> %ret
+}
+
diff --git a/llvm/test/CodeGen/X86/atomic-scalarization.ll b/llvm/test/CodeGen/X86/atomic-scalarization.ll
deleted file mode 100644
index dfffb894a87ddb..00000000000000
--- a/llvm/test/CodeGen/X86/atomic-scalarization.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc %s --mtriple=x86_64 -o - | FileCheck %s
-
-define <1 x i32> @atomic_scalar_i32(ptr %x) {
-; CHECK-LABEL: atomic_scalar_i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl (%rdi), %eax
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
-  ret <1 x i32> %ret
-}
-
-define <1 x float> @atomic_scalar_float(ptr %x) {
-; CHECK-LABEL: atomic_scalar_float:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x float>, ptr %x acquire, align 4
-  ret <1 x float> %ret
-}
-
-define <1 x half> @atomic_scalar_half(ptr %x) {
-; CHECK-LABEL: atomic_scalar_half:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl (%rdi), %eax
-; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x half>, ptr %x acquire, align 4
-  ret <1 x half> %ret
-}
-
-define <1 x bfloat> @atomic_scalar_bfloat(ptr %x) {
-; CHECK-LABEL: atomic_scalar_bfloat:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl (%rdi), %eax
-; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-NEXT:    retq
-  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 4
-  ret <1 x bfloat> %ret
-}



More information about the llvm-commits mailing list