[llvm-branch-commits] [clang] clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} (PR #96872)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Aug 1 11:11:14 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96872
>From ea17c792053e32e39a7261e3bdf1673d98e4d94a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 11 Jun 2024 10:58:44 +0200
Subject: [PATCH 1/2] clang/AMDGPU: Emit atomicrmw for
__builtin_amdgcn_global_atomic_fadd_{f32|f64}
In most cases, the atomicrmw needs a syncscope and the new metadata for the
native instruction to be emitted instead of a cmpxchg expansion.
---
clang/lib/CodeGen/CGBuiltin.cpp | 39 +++++++++++++------
.../CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 2 +-
.../builtins-fp-atomics-gfx12.cl | 4 +-
.../builtins-fp-atomics-gfx90a.cl | 4 +-
.../builtins-fp-atomics-gfx940.cl | 4 +-
5 files changed, 34 insertions(+), 19 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0c2ee446aa303..02f85f340893d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MatrixBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -18776,8 +18777,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18789,18 +18788,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Intrinsic::ID IID;
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
switch (BuiltinID) {
- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
- ArgTy = llvm::Type::getFloatTy(getLLVMContext());
- IID = Intrinsic::amdgcn_global_atomic_fadd;
- break;
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getHalfTy(getLLVMContext()), 2);
IID = Intrinsic::amdgcn_global_atomic_fadd;
break;
- case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
- IID = Intrinsic::amdgcn_global_atomic_fadd;
- break;
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
IID = Intrinsic::amdgcn_global_atomic_fmin;
break;
@@ -19223,7 +19215,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
- case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19239,6 +19233,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+ case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19273,8 +19269,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
} else {
- // The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
- SSID = llvm::SyncScope::System;
+ // Most of the builtins do not have syncscope/order arguments. For DS
+ // atomics the scope doesn't really matter, as they implicitly operate at
+ // workgroup scope.
+ //
+ // The global/flat cases need to use agent scope to consistently produce
+ // the native instruction instead of a cmpxchg expansion.
+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
AO = AtomicOrdering::SequentiallyConsistent;
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
@@ -19289,6 +19290,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
if (Volatile)
RMW->setVolatile(true);
+
+ unsigned AddrSpace = Ptr.getType()->getAddressSpace();
+ if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
+ // Most targets require "amdgpu.no.fine.grained.memory" to emit the native
+ // instruction for flat and global operations.
+ llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
+ RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);
+
+ // Most targets require "amdgpu.ignore.denormal.mode" to emit the native
+ // instruction, but this only matters for float fadd.
+ if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
+ RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
+ }
+
return Builder.CreateBitCast(RMW, OrigTy);
}
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index f444657463568..e8889f57432f5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 63381942eaba5..21c1c38bc78dc 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
// CHECK-LABEL: test_local_add_2bf16_noret
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX12-LABEL: test_local_add_2bf16_noret
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index e2117f11858f7..3778f65feaad4 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: test_global_add_f64
-// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// GFX90A-LABEL: test_global_add_f64$local:
// GFX90A: global_atomic_add_f64
void test_global_add_f64(__global double *addr, double x) {
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index 92a33ceac2290..d4ca32b9b3cd6 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX940-LABEL: test_local_add_2bf16
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
>From 81f345d6741a8326367b6b7df9816125f594b713 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 27 Jun 2024 16:24:04 +0200
Subject: [PATCH 2/2] Use monotonic ordering
---
clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 2 +-
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl | 8 ++++----
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl | 4 ++--
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl | 8 ++++----
clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl | 8 ++++----
6 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 02f85f340893d..ad4cce77221a6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19276,7 +19276,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
// The global/flat cases need to use agent scope to consistently produce
// the native instruction instead of a cmpxchg expansion.
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
- AO = AtomicOrdering::SequentiallyConsistent;
+ AO = AtomicOrdering::Monotonic;
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index e8889f57432f5..138616ccca718 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
index 21c1c38bc78dc..6b8a6d14575db 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
// CHECK-LABEL: test_local_add_2bf16_noret
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX12-LABEL: test_local_add_2bf16_noret
@@ -32,7 +32,7 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX12-LABEL: test_local_add_2f16
// GFX12: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -40,7 +40,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX12-LABEL: test_local_add_2f16_noret
// GFX12: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
index ad4d0b7af3d4b..2f00977ec6014 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
// REQUIRES: amdgpu-registered-target
// CHECK-LABEL: test_fadd_local
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
// GFX8-LABEL: test_fadd_local$local:
// GFX8: ds_add_rtn_f32 v2, v0, v1
// GFX8: s_endpgm
@@ -16,7 +16,7 @@ kernel void test_fadd_local(__local float *ptr, float val){
}
// CHECK-LABEL: test_fadd_local_volatile
-// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
volatile float *res;
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
index 3778f65feaad4..c525c250c937c 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: test_global_add_f64
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// GFX90A-LABEL: test_global_add_f64$local:
// GFX90A: global_atomic_add_f64
void test_global_add_f64(__global double *addr, double x) {
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
}
// CHECK-LABEL: test_ds_add_local_f64
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} monotonic, align 8
// GFX90A: test_ds_add_local_f64$local
// GFX90A: ds_add_rtn_f64
void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
}
// CHECK-LABEL: test_ds_addf_local_f32
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
// GFX90A-LABEL: test_ds_addf_local_f32$local
// GFX90A: ds_add_rtn_f32
void test_ds_addf_local_f32(__local float *addr, float x){
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
index d4ca32b9b3cd6..5481138b9fee4 100644
--- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}}
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
// GFX940-LABEL: test_local_add_2bf16
@@ -54,7 +54,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
}
// CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX940-LABEL: test_local_add_2f16
// GFX940: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
@@ -62,7 +62,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX940-LABEL: test_local_add_2f16_noret
// GFX940: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
@@ -70,7 +70,7 @@ void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
More information about the llvm-branch-commits
mailing list