[llvm] [RFC] IR: Support atomicrmw FP ops with vector types (PR #86796)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 27 05:19:07 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/86796
Allow using atomicrmw fadd, fsub, fmin, and fmax with vectors of floating-point type. AMDGPU supports atomic fadd for <2 x half> and <2 x bfloat> on some targets and address spaces.
Note this only supports the proper floating-point operations; float vector typed xchg is still not supported. cmpxchg still only supports integers, so this inserts bitcasts for the loop expansion.
I have support for fp vector typed xchg, and vector of int/ptr separately implemented but I don't have an immediate need for those beyond feature consistency.
>From 27772cf3777d49d1314d6d5921732a629367922f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 26 Mar 2024 23:45:34 +0530
Subject: [PATCH] [RFC] IR: Support atomicrmw FP ops with vector types
Allow using atomicrmw fadd, fsub, fmin, and fmax with vectors
of floating-point type. AMDGPU supports atomic fadd for <2 x half>
and <2 x bfloat> on some targets and address spaces.
Note this only supports the proper floating-point operations;
float vector typed xchg is still not supported. cmpxchg still only
supports integers, so this inserts bitcasts for the loop expansion.
I have support for fp vector typed xchg, and vector of int/ptr
separately implemented but I don't have an immediate need for those
beyond feature consistency.
---
llvm/docs/LangRef.rst | 11 +-
llvm/lib/AsmParser/LLParser.cpp | 2 +-
llvm/lib/CodeGen/AtomicExpandPass.cpp | 6 +-
llvm/lib/IR/Verifier.cpp | 5 +-
llvm/test/Assembler/atomic.ll | 16 +
.../invalid-atomicrmw-xchg-fp-vector.ll | 7 +
.../AArch64/atomicrmw-fadd-fp-vector.ll | 115 ++
.../CodeGen/X86/atomicrmw-fadd-fp-vector.ll | 84 ++
.../AMDGPU/expand-atomicrmw-fp-vector.ll | 1204 +++++++++++++++++
9 files changed, 1439 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll
create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
create mode 100644 llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a4be31576931cd..7adecc305e3cc4 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -10986,11 +10986,12 @@ For most of these operations, the type of '<value>' must be an integer
type whose bit width is a power of two greater than or equal to eight
and less than or equal to a target-specific size limit. For xchg, this
may also be a floating point or a pointer type with the same size constraints
-as integers. For fadd/fsub/fmax/fmin, this must be a floating point type. The
-type of the '``<pointer>``' operand must be a pointer to that type. If
-the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not
-allowed to modify the number or order of execution of this
-``atomicrmw`` with other :ref:`volatile operations <volatile>`.
+as integers. For fadd/fsub/fmax/fmin, this must be a floating-point
+or vector of floating-point type. The type of the '``<pointer>``'
+operand must be a pointer to that type. If the ``atomicrmw`` is marked
+as ``volatile``, then the optimizer is not allowed to modify the
+number or order of execution of this ``atomicrmw`` with other
+:ref:`volatile operations <volatile>`.
Note: if the alignment is not greater or equal to the size of the `<value>`
type, the atomic operation is likely to require a lock and have poor
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index f0be021668afa7..25316e8f67764d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8222,7 +8222,7 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
" operand must be an integer, floating point, or pointer type");
}
} else if (IsFP) {
- if (!Val->getType()->isFloatingPointTy()) {
+ if (!Val->getType()->isFPOrFPVectorTy()) {
return error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be a floating point type");
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 894285a7eb2564..78f9690e496e51 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -559,9 +559,9 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Value *&Success, Value *&NewLoaded) {
Type *OrigTy = NewVal->getType();
- // This code can go away when cmpxchg supports FP types.
+ // This code can go away when cmpxchg supports FP and vector types.
assert(!OrigTy->isPointerTy());
- bool NeedBitcast = OrigTy->isFloatingPointTy();
+ bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
if (NeedBitcast) {
IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
NewVal = Builder.CreateBitCast(NewVal, IntTy);
@@ -728,7 +728,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
unsigned ValueSize = DL.getTypeStoreSize(ValueType);
PMV.ValueType = PMV.IntValueType = ValueType;
- if (PMV.ValueType->isFloatingPointTy())
+ if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
PMV.IntValueType =
Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e16572540f96c2..aeb0a72bef88ec 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4248,9 +4248,10 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
" operand must have integer or floating point type!",
&RMWI, ElTy);
} else if (AtomicRMWInst::isFPOperation(Op)) {
- Check(ElTy->isFloatingPointTy(),
+ Check(ElTy->isFPOrFPVectorTy(),
"atomicrmw " + AtomicRMWInst::getOperationName(Op) +
- " operand must have floating point type!",
+ " operand must have floating-point or vector of floating-point "
+ "type!",
&RMWI, ElTy);
} else {
Check(ElTy->isIntegerTy(),
diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index e967407af14bd1..32fe82ef2268c8 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -72,3 +72,19 @@ define void @fp_atomics(ptr %x) {
ret void
}
+
+define void @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+ ; CHECK: %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+ %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+
+ ; CHECK: %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+ %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+
+ ; CHECK: %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+ %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+
+ ; CHECK: %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+ %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+
+ ret void
+}
diff --git a/llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll b/llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll
new file mode 100644
index 00000000000000..ea523255ee774f
--- /dev/null
+++ b/llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer, floating point, or pointer type
+define <2 x half> @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+ %atomic.xchg = atomicrmw xchg ptr %x, <2 x half> %val seq_cst
+ ret <2 x half> %atomic.xchg
+}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
new file mode 100644
index 00000000000000..a7539ac3cce802
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s
+; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; NOLSE: // %bb.0:
+; NOLSE-NEXT: fcvtl v1.4s, v0.4h
+; NOLSE-NEXT: ldr s0, [x0]
+; NOLSE-NEXT: b .LBB0_2
+; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
+; NOLSE-NEXT: fmov s0, w10
+; NOLSE-NEXT: cmp w10, w9
+; NOLSE-NEXT: b.eq .LBB0_5
+; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
+; NOLSE-NEXT: // =>This Loop Header: Depth=1
+; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT: fcvtl v2.4s, v0.4h
+; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
+; NOLSE-NEXT: fcvtn v2.4h, v2.4s
+; NOLSE-NEXT: fmov w8, s2
+; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
+; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
+; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT: ldaxr w10, [x0]
+; NOLSE-NEXT: cmp w10, w9
+; NOLSE-NEXT: b.ne .LBB0_1
+; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
+; NOLSE-NEXT: stlxr wzr, w8, [x0]
+; NOLSE-NEXT: cbnz wzr, .LBB0_3
+; NOLSE-NEXT: b .LBB0_1
+; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; LSE: // %bb.0:
+; LSE-NEXT: fcvtl v1.4s, v0.4h
+; LSE-NEXT: ldr s0, [x0]
+; LSE-NEXT: .LBB0_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: fcvtl v2.4s, v0.4h
+; LSE-NEXT: fmov w8, s0
+; LSE-NEXT: mov w10, w8
+; LSE-NEXT: fadd v2.4s, v2.4s, v1.4s
+; LSE-NEXT: fcvtn v2.4h, v2.4s
+; LSE-NEXT: fmov w9, s2
+; LSE-NEXT: casal w10, w9, [x0]
+; LSE-NEXT: fmov s0, w10
+; LSE-NEXT: cmp w10, w8
+; LSE-NEXT: b.ne .LBB0_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; LSE-NEXT: ret
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; NOLSE: // %bb.0:
+; NOLSE-NEXT: ldr d1, [x0]
+; NOLSE-NEXT: b .LBB1_2
+; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
+; NOLSE-NEXT: fmov d1, x10
+; NOLSE-NEXT: cmp x10, x9
+; NOLSE-NEXT: b.eq .LBB1_5
+; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
+; NOLSE-NEXT: // =>This Loop Header: Depth=1
+; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s
+; NOLSE-NEXT: fmov x9, d1
+; NOLSE-NEXT: fmov x8, d2
+; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
+; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
+; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT: ldaxr x10, [x0]
+; NOLSE-NEXT: cmp x10, x9
+; NOLSE-NEXT: b.ne .LBB1_1
+; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
+; NOLSE-NEXT: stlxr wzr, x8, [x0]
+; NOLSE-NEXT: cbnz wzr, .LBB1_3
+; NOLSE-NEXT: b .LBB1_1
+; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT: fmov d0, d1
+; NOLSE-NEXT: ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; LSE: // %bb.0:
+; LSE-NEXT: ldr d1, [x0]
+; LSE-NEXT: .LBB1_1: // %atomicrmw.start
+; LSE-NEXT: // =>This Inner Loop Header: Depth=1
+; LSE-NEXT: fadd v2.2s, v1.2s, v0.2s
+; LSE-NEXT: fmov x8, d1
+; LSE-NEXT: mov x10, x8
+; LSE-NEXT: fmov x9, d2
+; LSE-NEXT: casal x10, x9, [x0]
+; LSE-NEXT: fmov d1, x10
+; LSE-NEXT: cmp x10, x8
+; LSE-NEXT: b.ne .LBB1_1
+; LSE-NEXT: // %bb.2: // %atomicrmw.end
+; LSE-NEXT: fmov d0, d1
+; LSE-NEXT: ret
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+ ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
new file mode 100644
index 00000000000000..4f8cd5a52ed4c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $88, %rsp
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: psrld $16, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1
+; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2 at PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2 at PLT
+; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: callq __truncsfhf2 at PLT
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %ebp
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2 at PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extendhfsf2 at PLT
+; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: callq __truncsfhf2 at PLT
+; CHECK-NEXT: pextrw $0, %xmm0, %ecx
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: orl %ebp, %ecx
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: pextrw $0, %xmm0, %edx
+; CHECK-NEXT: shll $16, %edx
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: addps %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, %rcx
+; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi)
+; CHECK-NEXT: movq %rax, %xmm1
+; CHECK-NEXT: jne .LBB1_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+ ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
new file mode 100644
index 00000000000000..1411890e01dc59
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
@@ -0,0 +1,1204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck %s
+
+;---------------------------------------------------------------------
+; atomicrmw fadd
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <2 x bfloat> %res
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <2 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
+ ret <4 x bfloat> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
+; CHECK-LABEL: define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x float> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x float> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
+ ret <2 x float> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <2 x bfloat> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <2 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
+ ret <4 x bfloat> %res
+}
+
+define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
+; CHECK-LABEL: define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x float> [[LOADED]], [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x float> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
+ ret <2 x float> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmin
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <2 x bfloat> %res
+}
+
+define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[TMP6]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <2 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[TMP6]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[TMP6]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
+ ret <4 x bfloat> %res
+}
+
+define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
+; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x float> [[TMP6]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
+ ret <2 x float> %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fmax
+;---------------------------------------------------------------------
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <2 x bfloat> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
+; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x half> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
+ ret <2 x half> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <2 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
+ ret <4 x bfloat> %res
+}
+
+define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
+; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x half> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
+ ret <4 x half> %res
+}
+
+define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
+; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <4 x bfloat> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
+ ret <4 x bfloat> %res
+}
+
+define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
+; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: ret <2 x float> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
+ ret <2 x float> %res
+}
More information about the llvm-commits
mailing list